From 65fd2d47db4c07c6619d3a734b2c058ad24dfdf8 Mon Sep 17 00:00:00 2001 From: john bowen Date: Tue, 5 Nov 2024 11:31:19 -0800 Subject: [PATCH] Clang format --- .clang-format | 7 +- benchmark/benchmark-atomic.cpp | 626 ++-- benchmark/host-device-lambda-benchmark.cpp | 23 +- benchmark/ltimes.cpp | 2987 ++++++++++---------- benchmark/raja_view_blur.cpp | 231 +- cmake/RAJAMacros.cmake | 16 +- 6 files changed, 1988 insertions(+), 1902 deletions(-) diff --git a/.clang-format b/.clang-format index b6fa54b233..898f791bf7 100644 --- a/.clang-format +++ b/.clang-format @@ -31,7 +31,7 @@ BraceWrapping: AfterExternBlock: false BeforeCatch: true BeforeElse: true - BeforeLambdaBody: true + BeforeLambdaBody: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false @@ -40,12 +40,13 @@ BraceWrapping: # Pointer alignment DerivePointerAlignment: false PointerAlignment: Left + +# Single line config AllowShortIfStatementsOnASingleLine : true AllowShortFunctionsOnASingleLine : true AllowShortLoopsOnASingleLine : false AllowAllArgumentsOnNextLine : true AllowAllParametersOfDeclarationOnNextLine : false -AlignTrailingComments : true BinPackArguments : true BinPackParameters : false ConstructorInitializerAllOnOneLineOrOnePerLine : true @@ -67,4 +68,4 @@ SpacesInCStyleCastParentheses: false SpacesInContainerLiterals: false SpacesInConditionalStatement: false SpacesInParentheses: false -SpacesInSquareBrackets: false +SpacesInSquareBrackets: false \ No newline at end of file diff --git a/benchmark/benchmark-atomic.cpp b/benchmark/benchmark-atomic.cpp index ebe3a858ff..ecb316b451 100644 --- a/benchmark/benchmark-atomic.cpp +++ b/benchmark/benchmark-atomic.cpp @@ -5,15 +5,16 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // This file is intended to provide an interface for comparing the performance -// of RAJA's atomic implementations with Desul's atomic implementations. In order -// to accomplish this without requiring two separate build system configurations -// this file directly includes "desul/atomics.hpp" and directly calls desul namespace -// atomics. This is different from how a typical RAJA user would call a desul atomic. +// of RAJA's atomic implementations with Desul's atomic implementations. In +// order to accomplish this without requiring two separate build system +// configurations this file directly includes "desul/atomics.hpp" and directly +// calls desul namespace atomics. This is different from how a typical RAJA +// user would call a desul atomic. #include "RAJA/RAJA.hpp" #include "RAJA/util/for_each.hpp" #include "RAJA/util/Timer.hpp" -#if defined (RAJA_ENABLE_OPENMP) +#if defined(RAJA_ENABLE_OPENMP) #include "RAJA/policy/openmp/atomic.hpp" #endif #include "desul/atomics.hpp" @@ -25,85 +26,99 @@ /// Conditional compilation for CUDA benchmarks. 
-#if defined (RAJA_ENABLE_CUDA) +#if defined(RAJA_ENABLE_CUDA) #include "RAJA/policy/cuda/atomic.hpp" -template -struct ExecPolicyGPU { - using policy = RAJA::cuda_exec; - static std::string PolicyName() { - std::stringstream ss; - ss << "CUDA execution with block size " << BLOCK_SZ; - return ss.str(); - } +template +struct ExecPolicyGPU +{ + using policy = RAJA::cuda_exec; + static std::string PolicyName() + { + std::stringstream ss; + ss << "CUDA execution with block size " << BLOCK_SZ; + return ss.str(); + } }; -struct GPUAtomic { - using policy = RAJA::policy::cuda::cuda_atomic; +struct GPUAtomic +{ + using policy = RAJA::policy::cuda::cuda_atomic; }; -#elif defined (RAJA_ENABLE_HIP) +#elif defined(RAJA_ENABLE_HIP) #include "RAJA/policy/hip/atomic.hpp" -template -struct ExecPolicyGPU { - using policy = RAJA::hip_exec; - static std::string PolicyName() { - std::stringstream ss; - ss << "HIP execution with block size " << BLOCK_SZ; - return ss.str(); - } +template +struct ExecPolicyGPU +{ + using policy = RAJA::hip_exec; + static std::string PolicyName() + { + std::stringstream ss; + ss << "HIP execution with block size " << BLOCK_SZ; + return ss.str(); + } }; -struct GPUAtomic { - using policy = RAJA::policy::hip::hip_atomic; +struct GPUAtomic +{ + using policy = RAJA::policy::hip::hip_atomic; }; #endif #define BLOCK_SZ 256 -#define INDENT " " +#define INDENT " " using raja_default_desul_order = desul::MemoryOrderRelaxed; using raja_default_desul_scope = desul::MemoryScopeDevice; -// Desul atomics have a different signature than RAJA's built in ops. The following code provides some -// helper function templates so that they can be called using the same signature in timing code. +// Desul atomics have a different signature than RAJA's built in ops. The +// following code provides some helper function templates so that they can be +// called using the same signature in timing code. // Struct holding Desul atomic signature typedef -template -struct DesulAtomicSignature { - using type = ReturnType(*)(Args..., raja_default_desul_order, raja_default_desul_scope); +template +struct DesulAtomicSignature +{ + using type = ReturnType (*)(Args..., + raja_default_desul_order, + raja_default_desul_scope); }; // Struct holding RAJA atomic signature typedef -template -struct RajaAtomicSignature { - using type = AtomicType(*)(AtomicType*, const AtomicType); +template +struct RajaAtomicSignature +{ + using type = AtomicType (*)(AtomicType*, const AtomicType); }; -/// RAJA::atomicAdd and other RAJA namespace atomic calls are overloaded and have an ambiguous type -/// so they can't be passed as a template parameter. -/// The following macro disambiguates the call and provide a signature comaptible with the DESUL -/// wrapper. AtomicOperation must be a valid RAJA namespace atomic operation, like atomicAdd, -/// atomicMax, etc. 
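/// For instance, OPERATOR_CALL_BINARY(atomicAdd) below expands to roughly
///
///   RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType val) const
///   {
///     return RAJA::atomicAdd(Policy {}, acc, val);
///   }
///
/// so every wrapper functor exposes the same (T*, T) call shape that the desul
/// wrappers defined further down provide.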
-#define OPERATOR_CALL_BINARY(AtomicOperation) \ - template \ - RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType val) const { \ - return RAJA::AtomicOperation(Policy {}, acc, val); \ - } \ - -#define OPERATOR_CALL_UNARY(AtomicOperation) \ - template \ - RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType) const { \ - return RAJA::AtomicOperation(Policy {}, acc); \ - } \ - -#define DECLARE_ATOMIC_WRAPPER(AtomicFunctorName, AtomicOperatorDeclaration) \ -template \ -struct AtomicFunctorName { \ - const char* name = #AtomicFunctorName ; \ - AtomicOperatorDeclaration \ -}; \ +/// RAJA::atomicAdd and other RAJA namespace atomic calls are overloaded and +/// have an ambiguous type so they can't be passed as a template parameter. The +/// following macro disambiguates the call and provide a signature comaptible +/// with the DESUL wrapper. AtomicOperation must be a valid RAJA namespace +/// atomic operation, like atomicAdd, atomicMax, etc. +#define OPERATOR_CALL_BINARY(AtomicOperation) \ + template \ + RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType val) const \ + { \ + return RAJA::AtomicOperation(Policy {}, acc, val); \ + } + +#define OPERATOR_CALL_UNARY(AtomicOperation) \ + template \ + RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType) const \ + { \ + return RAJA::AtomicOperation(Policy {}, acc); \ + } + +#define DECLARE_ATOMIC_WRAPPER(AtomicFunctorName, AtomicOperatorDeclaration) \ + template \ + struct AtomicFunctorName \ + { \ + const char* name = #AtomicFunctorName; \ + AtomicOperatorDeclaration \ + }; DECLARE_ATOMIC_WRAPPER(AtomicAdd, OPERATOR_CALL_BINARY(atomicAdd)) DECLARE_ATOMIC_WRAPPER(AtomicSub, OPERATOR_CALL_BINARY(atomicSub)) @@ -119,228 +134,365 @@ DECLARE_ATOMIC_WRAPPER(AtomicXor, OPERATOR_CALL_BINARY(atomicXor)) DECLARE_ATOMIC_WRAPPER(AtomicExchange, OPERATOR_CALL_BINARY(atomicExchange)) DECLARE_ATOMIC_WRAPPER(AtomicLoad, OPERATOR_CALL_UNARY(atomicLoad)) -/// Instead of complicating the above macro to handle these two atomics, do the declarations -/// manually below. -template -struct AtomicStore { - const char* name = "AtomicStore"; - template - RAJA_HOST_DEVICE void operator()(ArgType* acc, const ArgType val) const { - return RAJA::atomicStore(Policy {}, acc, val); - } +/// Instead of complicating the above macro to handle these two atomics, do the +/// declarations manually below. +template +struct AtomicStore +{ + const char* name = "AtomicStore"; + template + RAJA_HOST_DEVICE void operator()(ArgType* acc, const ArgType val) const + { + return RAJA::atomicStore(Policy {}, acc, val); + } }; -template -struct AtomicCAS { - const char* name = "AtomicCAS"; - template - RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, ArgType compare) const { - return RAJA::atomicCAS(Policy {}, acc, compare, ArgType(1)); - } +template +struct AtomicCAS +{ + const char* name = "AtomicCAS"; + template + RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, ArgType compare) const + { + return RAJA::atomicCAS(Policy {}, acc, compare, ArgType(1)); + } }; /// ExecPolicy wrapper for OpenMP -struct ExecPolicyOMP { - using policy = RAJA::omp_parallel_for_exec;; - static std::string PolicyName() { - std::stringstream ss; - ss << "OpenMP execution"; - return ss.str(); - } +struct ExecPolicyOMP +{ + using policy = RAJA::omp_parallel_for_exec; + ; + static std::string PolicyName() + { + std::stringstream ss; + ss << "OpenMP execution"; + return ss.str(); + } }; -/// Functor wrapping the desul implementation. 
Wrapping the desul call ensure an identical signature with -/// RAJA's implementations. Wrapping the call in an functor allows simple type deduction for printing -/// from within the benchmark. -template::type atomic_impl> -struct atomicWrapperDesulTernary { - /// Call operator overload template that allows invoking DESUL atomic with a (int*)(T*, T) signature - RAJA_HOST_DEVICE ReturnType operator()(T * acc, T value) const { - return atomic_impl(acc, value, T(1), raja_default_desul_order{}, - raja_default_desul_scope{}); - } +/// Functor wrapping the desul implementation. Wrapping the desul call ensure +/// an identical signature with RAJA's implementations. Wrapping the call in an +/// functor allows simple type deduction for printing from within the benchmark. +template < + typename T, + typename ReturnType, + typename DesulAtomicSignature::type atomic_impl> +struct atomicWrapperDesulTernary +{ + /// Call operator overload template that allows invoking DESUL atomic with a + /// (int*)(T*, T) signature + RAJA_HOST_DEVICE ReturnType operator()(T* acc, T value) const + { + return atomic_impl(acc, value, T(1), raja_default_desul_order {}, + raja_default_desul_scope {}); + } }; -template::type atomic_impl> -struct atomicWrapperDesulBinary { - /// Call operator overload template that allows invoking DESUL atomic with a (int*)(T*, T) signature - RAJA_HOST_DEVICE ReturnType operator()(T * acc, T value) const { - return atomic_impl(acc, value, raja_default_desul_order{}, - raja_default_desul_scope{}); - } +template < + typename T, + typename ReturnType, + typename DesulAtomicSignature::type atomic_impl> +struct atomicWrapperDesulBinary +{ + /// Call operator overload template that allows invoking DESUL atomic with a + /// (int*)(T*, T) signature + RAJA_HOST_DEVICE ReturnType operator()(T* acc, T value) const + { + return atomic_impl(acc, value, raja_default_desul_order {}, + raja_default_desul_scope {}); + } }; /// Unary wrapper variant for increment and decrement benchmarks. 
-template::type atomic_impl> -struct atomicWrapperDesulUnary { - RAJA_HOST_DEVICE ReturnType operator()(T* acc, T) const { - return atomic_impl(acc, raja_default_desul_order{}, - raja_default_desul_scope{}); - } +template ::type atomic_impl> +struct atomicWrapperDesulUnary +{ + RAJA_HOST_DEVICE ReturnType operator()(T* acc, T) const + { + return atomic_impl(acc, raja_default_desul_order {}, + raja_default_desul_scope {}); + } }; -template -class IsDesul : public std::false_type {}; - -template::type atomic_impl> -class IsDesul> : public std::true_type {}; - -template::type atomic_impl> -class IsDesul> : public std::true_type {}; - -template::type atomic_impl> -class IsDesul> : public std::true_type {}; - -template -std::string GetImplName (const AtomicImplType& impl) { - if (IsDesul::value) { - return "Desul atomic"; - } else { - return "RAJA atomic"; - } +template +class IsDesul : public std::false_type +{}; + +template ::type atomic_impl> +class IsDesul> + : public std::true_type +{}; + +template ::type atomic_impl> +class IsDesul> + : public std::true_type +{}; + +template ::type atomic_impl> +class IsDesul> + : public std::true_type +{}; + +template +std::string GetImplName(const AtomicImplType& impl) +{ + if (IsDesul::value) + { + return "Desul atomic"; + } + else + { + return "RAJA atomic"; + } } -template -void TimeAtomicOp(const AtomicImplType& atomic_impl, uint64_t N, uint64_t num_iterations = 4, int array_size = 100, bool print_to_output = true) { - RAJA::Timer timer; - - // Allocate memory - AtomicType* device_value = nullptr; - int len_array = test_array ? array_size : 1; - camp::resources::Resource resource {RAJA::resources::get_resource::type::get_default()}; - device_value = resource.allocate(len_array); - - timer.start(); - if (test_array) { - for (uint64_t i = 0; i < num_iterations; ++i) { - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=] RAJA_HOST_DEVICE(uint64_t tid) { - atomic_impl(&(device_value[tid % array_size]), AtomicType(1)); - }); - } - } else { - for (uint64_t i = 0; i < num_iterations; ++i) { - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=] RAJA_HOST_DEVICE(uint64_t tid) { - atomic_impl(device_value, AtomicType(1)); - }); - } +template +void TimeAtomicOp(const AtomicImplType& atomic_impl, + uint64_t N, + uint64_t num_iterations = 4, + int array_size = 100, + bool print_to_output = true) +{ + RAJA::Timer timer; + + // Allocate memory + AtomicType* device_value = nullptr; + int len_array = test_array ? 
array_size : 1; + camp::resources::Resource resource {RAJA::resources::get_resource< + typename ExecPolicy::policy>::type::get_default()}; + device_value = resource.allocate(len_array); + + timer.start(); + if (test_array) + { + for (uint64_t i = 0; i < num_iterations; ++i) + { + RAJA::forall( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_HOST_DEVICE(uint64_t tid) + { atomic_impl(&(device_value[tid % array_size]), AtomicType(1)); }); } - - resource.wait(); - timer.stop(); - resource.deallocate(device_value); - - double t = timer.elapsed(); - if (print_to_output) { - std::cout << INDENT << INDENT << t << "s" << INDENT; - std::cout << GetImplName(atomic_impl) << ", "; - if (test_array) { - std::cout << "Number of atomics under contention " << array_size << ", "; - } - std::cout << num_iterations * N << " many atomic operations" << ", "; - std::cout << ExecPolicy::PolicyName(); - std::cout << std::endl; + } + else + { + for (uint64_t i = 0; i < num_iterations; ++i) + { + RAJA::forall( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_HOST_DEVICE(uint64_t tid) + { atomic_impl(device_value, AtomicType(1)); }); + } + } + + resource.wait(); + timer.stop(); + resource.deallocate(device_value); + + double t = timer.elapsed(); + if (print_to_output) + { + std::cout << INDENT << INDENT << t << "s" << INDENT; + std::cout << GetImplName(atomic_impl) << ", "; + if (test_array) + { + std::cout << "Number of atomics under contention " << array_size << ", "; } + std::cout << num_iterations * N << " many atomic operations" + << ", "; + std::cout << ExecPolicy::PolicyName(); + std::cout << std::endl; + } } -template +template struct list_concat; -template -struct list_concat, camp::list> { - using type = camp::list; +template +struct list_concat, camp::list> +{ + using type = camp::list; }; -/// Holder for atomic operations that work with arbitrary atomic type, e.g. double, float -/// and ints etc. -template -struct universal_atomic_ops { - using type = camp::list, AtomicAdd>, - std::pair, AtomicSub>, - std::pair, AtomicMin>, - std::pair, AtomicMax>, - std::pair, AtomicIncBinary>, - std::pair, AtomicDecBinary>, - std::pair, AtomicIncUnary>, - std::pair, AtomicDecUnary>, - std::pair, AtomicLoad>, - std::pair, AtomicStore>, - std::pair, AtomicExchange>, - std::pair, AtomicCAS>>; +/// Holder for atomic operations that work with arbitrary atomic type, e.g. +/// double, float and ints etc. +template +struct universal_atomic_ops +{ + using type = camp::list< + std::pair, + AtomicAdd>, + std::pair, + AtomicSub>, + std::pair, + AtomicMin>, + std::pair, + AtomicMax>, + std::pair, + AtomicIncBinary>, + std::pair, + AtomicDecBinary>, + std::pair, + AtomicIncUnary>, + std::pair, + AtomicDecUnary>, + std::pair, + AtomicLoad>, + std::pair< + atomicWrapperDesulBinary, + AtomicStore>, + std::pair, + AtomicExchange>, + std::pair, + AtomicCAS>>; }; -template -struct integral_atomic_ops { - using type = camp::list, AtomicAnd>, - std::pair, AtomicOr>, - std::pair, AtomicXor>>; +template +struct integral_atomic_ops +{ + using type = + camp::list, + AtomicAnd>, + std::pair, + AtomicOr>, + std::pair, + AtomicXor>>; }; -template +template struct atomic_ops; /// Include all atomic ops if the underlying atomic to benchmark is integral. 
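/// (For a floating-point type, the specialization further below keeps only the
/// universal list. A minimal illustrative check, assuming a policy such as
/// RAJA::seq_atomic:
///
///   static_assert(
///       std::is_same<atomic_ops<double, RAJA::seq_atomic>::type,
///                    universal_atomic_ops<double, RAJA::seq_atomic>::type>::value,
///       "floating-point benchmarks use only the universal op list");
/// )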
-template -struct atomic_ops::value>::type> { - using type = typename list_concat::type, typename integral_atomic_ops::type>::type; +template +struct atomic_ops< + AtomicDataType, + Policy, + typename std::enable_if::value>::type> +{ + using type = typename list_concat< + typename universal_atomic_ops::type, + typename integral_atomic_ops::type>::type; }; /// Omit bitwise ops and, or, and xor for floating point types -template -struct atomic_ops::value>::type> { - using type = typename universal_atomic_ops< AtomicDataType, Policy >::type; +template +struct atomic_ops::value>::type> +{ + using type = typename universal_atomic_ops::type; }; -template -void ExecuteBenchmark(uint64_t N) { - using ops = atomic_ops; - using iter_t = typename ops::type; - auto iter = iter_t{}; - RAJA::for_each_type(iter, [&](auto type_pair) { +template +void ExecuteBenchmark(uint64_t N) +{ + using ops = atomic_ops; + using iter_t = typename ops::type; + auto iter = iter_t {}; + RAJA::for_each_type( + iter, + [&](auto type_pair) + { auto desul_functor = type_pair.first; - auto raja_functor = type_pair.second; - std::cout << INDENT << "Executing " << raja_functor.name << " integer benchmarks" << std::endl; - TimeAtomicOp(desul_functor, N, 100, 10000); - TimeAtomicOp(raja_functor, N, 100, 10000); - TimeAtomicOp(desul_functor, N, 10, 1000); - TimeAtomicOp(raja_functor, N, 10, 1000); + auto raja_functor = type_pair.second; + std::cout << INDENT << "Executing " << raja_functor.name + << " integer benchmarks" << std::endl; + TimeAtomicOp(desul_functor, N, 100, + 10000); + TimeAtomicOp(raja_functor, N, 100, + 10000); + TimeAtomicOp(desul_functor, N, 10, + 1000); + TimeAtomicOp(raja_functor, N, 10, + 1000); TimeAtomicOp(desul_functor, N, 4, 10); TimeAtomicOp(raja_functor, N, 4, 10); // Test contention over a single atomic TimeAtomicOp(desul_functor, N); TimeAtomicOp(raja_functor, N); - }); + }); } -int main (int argc, char* argv[]) { - if (argc > 2) { - RAJA_ABORT_OR_THROW("Usage: ./benchmark-atomic.exe where N is the optional size of the benchmark loop"); - } - uint64_t N = 1000000000; - if (argc == 2) { - N = std::stoll(argv[1]); - } +int main(int argc, char* argv[]) +{ + if (argc > 2) + { + RAJA_ABORT_OR_THROW("Usage: ./benchmark-atomic.exe where N is the " + "optional size of the benchmark loop"); + } + uint64_t N = 1000000000; + if (argc == 2) + { + N = std::stoll(argv[1]); + } + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + // Perform an untimed initialization of both desul and RAJA atomics. + TimeAtomicOp, int, true>( + atomicWrapperDesulBinary {}, N, 10, + 1000, false); + TimeAtomicOp, int, true>( + AtomicAdd {}, N, 10, 1000, false); + // GPU benchmarks + std::cout << "Executing GPU benchmarks" << std::endl; + ExecuteBenchmark>( + N); +#endif + +#if defined(RAJA_ENABLE_OPENMP) + // Perform an untimed initialization of both desul and RAJA atomics. + TimeAtomicOp( + AtomicAdd {}, N, 10, 1000, false); + TimeAtomicOp( + atomicWrapperDesulBinary {}, N, 10, + 1000, false); + + // OpenMP benchmarks + std::cout << "Executing OpenMP benchmarks" << std::endl; + ExecuteBenchmark(N); +#endif - #if defined (RAJA_ENABLE_CUDA) || defined (RAJA_ENABLE_HIP) - // Perform an untimed initialization of both desul and RAJA atomics. 
- TimeAtomicOp, int, true>(atomicWrapperDesulBinary{}, N, 10, 1000, false); - TimeAtomicOp, int, true>(AtomicAdd{}, N, 10, 1000, false); - // GPU benchmarks - std::cout << "Executing GPU benchmarks" << std::endl; - ExecuteBenchmark>(N); - #endif - - #if defined (RAJA_ENABLE_OPENMP) - // Perform an untimed initialization of both desul and RAJA atomics. - TimeAtomicOp(AtomicAdd{}, N, 10, 1000, false); - TimeAtomicOp(atomicWrapperDesulBinary{}, N, 10, 1000, false); - - // OpenMP benchmarks - std::cout << "Executing OpenMP benchmarks" << std::endl; - ExecuteBenchmark(N); - #endif - - return 0; + return 0; } diff --git a/benchmark/host-device-lambda-benchmark.cpp b/benchmark/host-device-lambda-benchmark.cpp index 33c78e2d8c..dde4f02efb 100644 --- a/benchmark/host-device-lambda-benchmark.cpp +++ b/benchmark/host-device-lambda-benchmark.cpp @@ -17,13 +17,15 @@ static void benchmark_daxpy_raw(benchmark::State& state) double* a = new double[N]; double* b = new double[N]; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { a[i] = 1.0; b[i] = 2.0; } double c = 3.14159; - while (state.KeepRunning()) { + while (state.KeepRunning()) + { RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { a[i] += b[i] * c; }); } @@ -34,13 +36,15 @@ static void benchmark_daxpy_host(benchmark::State& state) double* a = new double[N]; double* b = new double[N]; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { a[i] = 1.0; b[i] = 2.0; } double c = 3.14159; - while (state.KeepRunning()) { + while (state.KeepRunning()) + { RAJA::forall(RAJA::RangeSegment(0, N), [=] __host__(int i) { a[i] += b[i] * c; }); } @@ -51,17 +55,18 @@ static void benchmark_daxpy_host_device(benchmark::State& state) double* a = new double[N]; double* b = new double[N]; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { a[i] = 1.0; b[i] = 2.0; } double c = 3.14159; - while (state.KeepRunning()) { + while (state.KeepRunning()) + { RAJA::forall(RAJA::RangeSegment(0, N), - [=] RAJA_HOST_DEVICE(int i) { - a[i] += b[i] * c; - }); + [=] RAJA_HOST_DEVICE(int i) + { a[i] += b[i] * c; }); } } diff --git a/benchmark/ltimes.cpp b/benchmark/ltimes.cpp index 15a3e54eb8..b464b77396 100644 --- a/benchmark/ltimes.cpp +++ b/benchmark/ltimes.cpp @@ -7,46 +7,54 @@ // Place the following line before including RAJA to enable // statistics on the Vector abstractions -//#define RAJA_ENABLE_VECTOR_STATS +// #define RAJA_ENABLE_VECTOR_STATS // Un-comment the following line to run correctness checks on each variant -//#define DEBUG_LTIMES -//#define DEBUG_MATRIX_LOAD_STORE +// #define DEBUG_LTIMES +// #define DEBUG_MATRIX_LOAD_STORE #include "RAJA/config.hpp" -#define VARIANT_C 1 -#define VARIANT_C_VIEWS 1 -#define VARIANT_RAJA_SEQ 1 -#define VARIANT_RAJA_SEQ_ARGS 1 -#define VARIANT_RAJA_TEAMS_SEQ 1 -#define VARIANT_RAJA_VECTOR 1 -#define VARIANT_RAJA_MATRIX 1 -#define VARIANT_RAJA_SEQ_SHMEM 1 +#define VARIANT_C 1 +#define VARIANT_C_VIEWS 1 +#define VARIANT_RAJA_SEQ 1 +#define VARIANT_RAJA_SEQ_ARGS 1 +#define VARIANT_RAJA_TEAMS_SEQ 1 +#define VARIANT_RAJA_VECTOR 1 +#define VARIANT_RAJA_MATRIX 1 +#define VARIANT_RAJA_SEQ_SHMEM 1 #if defined(RAJA_ENABLE_OPENMP) -#define VARIANT_RAJA_OPENMP 1 +#define VARIANT_RAJA_OPENMP 1 #endif #if defined(RAJA_ENABLE_CUDA) -#define VARIANT_CUDA_KERNEL 1 -#define VARIANT_CUDA_TEAMS 1 -#define VARIANT_CUDA_TEAMS_MATRIX 1 -#define VARIANT_CUDA_KERNEL_SHMEM 1 +#define VARIANT_CUDA_KERNEL 1 +#define VARIANT_CUDA_TEAMS 1 +#define VARIANT_CUDA_TEAMS_MATRIX 1 +#define VARIANT_CUDA_KERNEL_SHMEM 1 #endif #if 
defined(RAJA_ENABLE_HIP) -#define RAJA_HIP_KERNEL 1 -#define RAJA_HIP_KERNEL_SHMEM 1 +#define RAJA_HIP_KERNEL 1 +#define RAJA_HIP_KERNEL_SHMEM 1 #endif - extern "C" { - void dgemm_(char * transa, char * transb, int * m, int * n, int * k, - double * alpha, double * A, int * lda, - double * B, int * ldb, double * beta, - double *, int * ldc); +void dgemm_(char* transa, + char* transb, + int* m, + int* n, + int* k, + double* alpha, + double* A, + int* lda, + double* B, + int* ldb, + double* beta, + double*, + int* ldc); } #include @@ -94,7 +102,6 @@ extern "C" { */ - using namespace RAJA; using namespace RAJA::expt; @@ -115,43 +122,43 @@ RAJA_INDEX_VALUE_T(IZ, int, "IZ"); // Function to check results // template -void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, +void checkResult(PHIVIEW_T& phi, + LVIEW_T& L, + PSIVIEW_T& psi, const int num_m, const int num_d, const int num_g, const int num_z); - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA LTIMES example...\n\n"; -//----------------------------------------------------------------------------// -// Define array dimensions, allocate arrays, define Layouts and Views, etc. + //----------------------------------------------------------------------------// + // Define array dimensions, allocate arrays, define Layouts and Views, etc. // Note: rand()/RAND_MAX is always zero, but forces the compiler to not // optimize out these values as compile time constants - const int num_m = 25 + (rand()/RAND_MAX); - const int num_g = 160 + (rand()/RAND_MAX); - const int num_d = 80 + (rand()/RAND_MAX); + const int num_m = 25 + (rand() / RAND_MAX); + const int num_g = 160 + (rand() / RAND_MAX); + const int num_d = 80 + (rand() / RAND_MAX); #ifdef DEBUG_LTIMES - const int num_iter = 1 ; //+ (rand()/RAND_MAX);; + const int num_iter = 1; //+ (rand()/RAND_MAX);; // use a decreased number of zones since this will take a lot longer // and we're not really measuring performance here - const long num_z = 32 + (rand()/RAND_MAX); + const long num_z = 32 + (rand() / RAND_MAX); #else - const int num_iter = 10 + (rand()/RAND_MAX); - const int num_z = 32*657 + (rand()/RAND_MAX); + const int num_iter = 10 + (rand() / RAND_MAX); + const int num_z = 32 * 657 + (rand() / RAND_MAX); #endif + double total_flops = 2.0 * num_g * num_z * num_d * num_m * num_iter * 1000.0; - double total_flops = 2.0*num_g*num_z*num_d*num_m*num_iter*1000.0; - - std::cout << "num_m = " << num_m << ", num_g = " << num_g << - ", num_d = " << num_d << ", num_z = " << num_z << "\n\n"; + std::cout << "num_m = " << num_m << ", num_g = " << num_g + << ", num_d = " << num_d << ", num_z = " << num_z << "\n\n"; std::cout << "total flops: " << (long)total_flops << "\n"; @@ -167,1133 +174,1076 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double* psi_data = &psi_vec[0]; double* phi_data = &phi_vec[0]; - for (int i = 0; i < L_size; ++i) { - L_data[i] = i+1; + for (int i = 0; i < L_size; ++i) + { + L_data[i] = i + 1; } - for (int i = 0; i < psi_size; ++i) { - psi_data[i] = 2*i+1; + for (int i = 0; i < psi_size; ++i) + { + psi_data[i] = 2 * i + 1; } // Note phi_data will be set to zero before each variant is run. 
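  //
  // Each variant below computes the same kernel: for every (m, g, z),
  //
  //   phi(m, g, z) += L(m, d) * psi(d, g, z)   summed over d,
  //
  // repeated num_iter times.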
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_C -{ - std::cout << "\n Running baseline C-version of LTimes...\n"; + { + std::cout << "\n Running baseline C-version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // Using restrict doesn't make much of a difference for most compilers. + // Using restrict doesn't make much of a difference for most compilers. #if 1 - double * RAJA_RESTRICT L = L_data; - double * RAJA_RESTRICT psi = psi_data; - double * RAJA_RESTRICT phi = phi_data; + double* RAJA_RESTRICT L = L_data; + double* RAJA_RESTRICT psi = psi_data; + double* RAJA_RESTRICT phi = phi_data; #else - double * L = L_data; - double * psi = psi_data; - double * phi = phi_data; + double* L = L_data; + double* psi = psi_data; + double* phi = phi_data; #endif - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter) - for (int g = 0; g < num_g; ++g) { - for (int z = 0; z < num_z; ++z) { - for (int m = 0; m < num_m; ++m) { - for (int d = 0; d < num_d; ++d) { - phi[g*num_z*num_m + z*num_m + m] += - L[d*num_m + m] * psi[g*num_z*num_d + z*num_d + d]; + for (int iter = 0; iter < num_iter; ++iter) + for (int g = 0; g < num_g; ++g) + { + for (int z = 0; z < num_z; ++z) + { + for (int m = 0; m < num_m; ++m) + { + for (int d = 0; d < num_d; ++d) + { + phi[g * num_z * num_m + z * num_m + m] += + L[d * num_m + m] * psi[g * num_z * num_d + z * num_d + d]; + } + } } } - } - } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " C-version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; - -} + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " C-version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_C_VIEWS -{ - std::cout << "\n Running C-version of LTimes (with Views)...\n"; + { + std::cout << "\n Running C-version of LTimes (with Views)...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{2, 1, 0}}; + PsiView 
psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter) - for (IG g(0); g < num_g; ++g) { - for (IZ z(0); z < num_z; ++z) { - for (IM m(0); m < num_m; ++m) { - for (ID d(0); d < num_d; ++d) { - phi(m, g, z) += L(m, d) * psi(d, g, z); + for (int iter = 0; iter < num_iter; ++iter) + for (IG g(0); g < num_g; ++g) + { + for (IZ z(0); z < num_z; ++z) + { + for (IM m(0); m < num_m; ++m) + { + for (ID d(0); d < num_d; ++d) + { + phi(m, g, z) += L(m, d) * psi(d, g, z); + } + } } } - } - } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " C-version of LTimes run time (with Views) (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " C-version of LTimes run time (with Views) (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_SEQ -{ - std::cout << "\n Running RAJA sequential version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::For<2, seq_exec, // g - statement::For<3, seq_exec, // z - statement::For<0, seq_exec, // m - statement::For<1, simd_exec, // d - statement::Lambda<0> - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA sequential version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, 
IZ>; - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA sequential version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{2, 1, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = RAJA::KernelPolicy>>>>>; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + + RAJA::Timer timer; + timer.start(); + + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=](IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); + + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA sequential version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_SEQ_ARGS -{ - std::cout << "\n Running RAJA sequential ARGS version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::For<2, seq_exec, // g - statement::For<3, seq_exec, // z - statement::For<0, seq_exec, // m - statement::For<1, simd_exec, // d - statement::Lambda<0, Segs<0, 1, 2, 3>> - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA sequential ARGS version of LTimes...\n"; - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA sequential ARGS version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + std::memset(phi_data, 0, phi_size * sizeof(double)); + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; -#if defined(DEBUG_LTIMES) 
- checkResult(phi, L, psi, num_m, num_d, num_g, num_z); -#endif -} -#endif + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; -//----------------------------------------------------------------------------// + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; -#if VARIANT_RAJA_TEAMS_SEQ -{ - std::cout << "\n Running RAJA Teams sequential version of LTimes...\n"; + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::array psi_perm {{2, 1, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + using EXECPOL = RAJA::KernelPolicy>>>>>>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + RAJA::Timer timer; + timer.start(); - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=](IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA sequential ARGS version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); +#if defined(DEBUG_LTIMES) + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); +#endif + } +#endif - using pol_launch = RAJA::LaunchPolicy; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + //----------------------------------------------------------------------------// +#if VARIANT_RAJA_TEAMS_SEQ + { + std::cout << "\n Running RAJA Teams sequential version of LTimes...\n"; + std::memset(phi_data, 0, phi_size * sizeof(double)); - RAJA::Timer timer; - timer.start(); + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch(RAJA::ExecPlace::HOST, RAJA::LaunchParams(), [=](RAJA::LaunchContext ctx){ + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_z), [&](IZ z){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_m), [&](IM m){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_d), [&](ID d){ - phi(m, g, z) += L(m, d) * psi(d, g, z); - }); - }); - }); - }); + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - }); // laucnch - } // iter - 
timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA Teams sequential version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array psi_perm {{2, 1, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); -#if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); -#endif + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); -} + using pol_launch = RAJA::LaunchPolicy; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; + + + RAJA::Timer timer; + timer.start(); + + for (int iter = 0; iter < num_iter; ++iter) + { + RAJA::launch( + RAJA::ExecPlace::HOST, RAJA::LaunchParams(), + [=](RAJA::LaunchContext ctx) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_g), + [&](IG g) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_z), + [&](IZ z) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_m), + [&](IM m) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_d), + [&](ID d) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); + }); + }); + }); + }); // laucnch + } // iter + + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA Teams sequential version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; + + +#if defined(DEBUG_LTIMES) + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); +#endif + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_VECTOR -{ - std::cout << "\n Running RAJA vectorized version of LTimes...\n"; + { + std::cout << "\n Running RAJA vectorized version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 0, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 0, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 0, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 0, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, 
phi_perm)); - using vector_t = RAJA::expt::VectorRegister; - using VecIZ = RAJA::expt::VectorIndex; + using vector_t = RAJA::expt::VectorRegister; + using VecIZ = RAJA::expt::VectorIndex; - using EXECPOL = - RAJA::KernelPolicy< - statement::For<2, seq_exec, // g - statement::For<0, seq_exec, // m - statement::For<1, seq_exec, // d + using EXECPOL = RAJA::KernelPolicy< + statement::For<2, seq_exec, // g + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - > - >; + statement::Lambda<0>>>>>; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::expt::tensor_stats::resetVectorStats(); + RAJA::expt::tensor_stats::resetVectorStats(); #endif - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g)); + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g)); - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - auto all_z = VecIZ::all(); + auto all_z = VecIZ::all(); - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g) { - phi(m, g, all_z) += L(m, d) * psi(d, g, all_z); - } - ); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, + [=](IM m, ID d, IG g) { + phi(m, g, all_z) += L(m, d) * psi(d, g, all_z); + }); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA vectorized version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA vectorized version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::printVectorStats(); + RAJA::tensor_stats::printVectorStats(); #endif #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_MATRIX -{ - std::cout << "\n Running RAJA column-major matrix version of LTimes...\n"; + { + std::cout << "\n Running RAJA column-major matrix version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 2, 0}}; - PsiView psi(psi_data, - 
RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 2, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 2, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 2, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::expt::SquareMatrixRegister; - //using matrix_t = RAJA::expt::SquareMatrixRegister; -// using matrix_t = RAJA::expt::RectMatrixRegister; + using matrix_t = + RAJA::expt::SquareMatrixRegister; + // using matrix_t = RAJA::expt::SquareMatrixRegister; + // using matrix_t = RAJA::expt::RectMatrixRegister; + std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << "x" + << matrix_t::s_dim_elem(1) << std::endl; - std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << - "x" << matrix_t::s_dim_elem(1) << std::endl; + printf("Num registers/matrix = %d\n", (int)matrix_t::s_num_registers); - printf("Num registers/matrix = %d\n", (int)matrix_t::s_num_registers); - - using RowM = RAJA::expt::RowIndex; - using ColD = RAJA::expt::ColIndex; - using ColZ = RAJA::expt::ColIndex; + using RowM = RAJA::expt::RowIndex; + using ColD = RAJA::expt::ColIndex; + using ColZ = RAJA::expt::ColIndex; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::resetVectorStats(); + RAJA::tensor_stats::resetVectorStats(); #endif - RAJA::Timer timer; - timer.start(); - + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), - [=](IG g) + for (int iter = 0; iter < num_iter; ++iter) { - auto rows_m = RowM::all(); - auto cols_z = ColZ::all(); - auto cols_d = ColD::all(); - auto rows_d = toRowIndex(cols_d); - - phi(rows_m, g, cols_z) += - L(rows_m, cols_d) * psi(rows_d, g, cols_z); - -// phi(rows_m, g, cols_z) = (L(rows_m, cols_d) * psi(rows_d, g, cols_z)) * (L(rows_m, cols_d) * psi(rows_d, g, cols_z)); - - }); - - - - } + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + [=](IG g) + { + auto rows_m = RowM::all(); + auto cols_z = ColZ::all(); + auto cols_d = ColD::all(); + auto rows_d = toRowIndex(cols_d); + + phi(rows_m, g, cols_z) += + L(rows_m, cols_d) * psi(rows_d, g, cols_z); + + // phi(rows_m, g, cols_z) = (L(rows_m, + // cols_d) * psi(rows_d, g, cols_z)) * + // (L(rows_m, cols_d) * psi(rows_d, g, + // cols_z)); + }); + } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA column-major matrix version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout + << " RAJA column-major matrix version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::printVectorStats(); + RAJA::tensor_stats::printVectorStats(); #endif #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - - -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_MATRIX -{ - std::cout << "\n Running RAJA row-major matrix version of LTimes...\n"; + { + std::cout << "\n Running RAJA row-major matrix version of LTimes...\n"; - 
std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 0, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 0, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 0, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 0, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::expt::SquareMatrixRegister; + using matrix_t = RAJA::expt::SquareMatrixRegister; - std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << - "x" << matrix_t::s_dim_elem(1) << std::endl; + std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << "x" + << matrix_t::s_dim_elem(1) << std::endl; using RowM = RAJA::expt::RowIndex; using ColD = RAJA::expt::ColIndex; using ColZ = RAJA::expt::ColIndex; - #ifdef RAJA_ENABLE_VECTOR_STATS +#ifdef RAJA_ENABLE_VECTOR_STATS RAJA::expt::tensor_stats::resetVectorStats(); - #endif +#endif RAJA::Timer timer; timer.start(); - for (int iter = 0;iter < num_iter;++ iter){ - - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), - [=](IG g) - { - - auto rows_m = RowM::all(); - auto cols_z = ColZ::all(); - auto cols_d = ColD::all(); - auto rows_d = toRowIndex(cols_d); - - phi(rows_m, g, cols_z) += - L(rows_m, cols_d) * psi(rows_d, g, cols_z); - - }); - - + for (int iter = 0; iter < num_iter; ++iter) + { + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + [=](IG g) + { + auto rows_m = RowM::all(); + auto cols_z = ColZ::all(); + auto cols_d = ColD::all(); + auto rows_d = toRowIndex(cols_d); + + phi(rows_m, g, cols_z) += + L(rows_m, cols_d) * psi(rows_d, g, cols_z); + }); } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA row-major matrix version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA row-major matrix version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::printVectorStats(); + RAJA::tensor_stats::printVectorStats(); #endif #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - - -} + } #endif //----------------------------------------------------------------------------// #if 
VARIANT_RAJA_SEQ_SHMEM -{ - std::cout << "\n Running RAJA sequential shmem version of LTimes...\n"; + { + std::cout << "\n Running RAJA sequential shmem version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + std::memset(phi_data, 0, phi_size * sizeof(double)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{0, 1, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{0, 1, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{0, 1, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{0, 1, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - constexpr size_t tile_m = 25; - constexpr size_t tile_d = 80; - constexpr size_t tile_z = 256; - constexpr size_t tile_g = 0; + constexpr size_t tile_m = 25; + constexpr size_t tile_d = 80; + constexpr size_t tile_z = 256; + constexpr size_t tile_g = 0; - using RAJA::statement::Param; + using RAJA::statement::Param; - using EXECPOL = - RAJA::KernelPolicy< + using EXECPOL = RAJA::KernelPolicy< - // Create memory tiles - statement::InitLocalMem, + // Create memory tiles + statement::InitLocalMem< + RAJA::cpu_tile_mem, RAJA::ParamList<0, 1, 2>, - // Tile outer m,d loops - statement::Tile<0, tile_fixed, seq_exec, // m - statement::Tile<1, tile_fixed, seq_exec, // d + // Tile outer m,d loops + statement::Tile< + 0, tile_fixed, seq_exec, // m + statement::Tile< + 1, tile_fixed, seq_exec, // d - // Load L(m,d) for m,d tile into shmem - statement::For<0, seq_exec, // m - statement::For<1, seq_exec, // d - statement::Lambda<0, Segs<0, 1>, - Params<0>, - Offsets<0, 1>> - > - >, + // Load L(m,d) for m,d tile into shmem + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d + statement::Lambda< + 0, Segs<0, 1>, Params<0>, + Offsets<0, 1>>>>, - // Run inner g, z loops with z loop tiled - statement::For<2, seq_exec, // g - statement::Tile<3, tile_fixed, seq_exec, // z + // Run inner g, z loops with z loop tiled + statement::For< + 2, seq_exec, // g + statement::Tile<3, tile_fixed, seq_exec, // z - // Load psi into shmem - statement::For<1, seq_exec, // d - statement::For<3, seq_exec, // z - statement::Lambda<1, Segs<1, 2, 3>, - Params<1>, - Offsets<1, 2, 3>> - > - >, + // Load psi into shmem + statement::For< + 1, seq_exec, // d + statement::For< + 3, seq_exec, // z + statement::Lambda< + 1, Segs<1, 2, 3>, Params<1>, + Offsets<1, 2, 3>>>>, - // Compute phi - statement::For<0, seq_exec, // m + // Compute phi + statement::For< + 0, seq_exec, // m - // Load 
phi into shmem - statement::For<3, seq_exec, // z - statement::Lambda<2, Segs<0, 2, 3>, - Params<2>, - Offsets<0, 2, 3>> - >, + // Load phi into shmem + statement::For< + 3, seq_exec, // z + statement::Lambda< + 2, Segs<0, 2, 3>, Params<2>, + Offsets<0, 2, 3>>>, - // Compute phi in shmem - statement::For<1, seq_exec, // d - statement::For<3, seq_exec, // z - statement::Lambda<3, Params<0, 1, 2>, - Offsets<0, 1, 2, 3>> - > - >, + // Compute phi in shmem + statement::For< + 1, seq_exec, // d + statement::For< + 3, seq_exec, // z + statement::Lambda< + 3, Params<0, 1, 2>, + Offsets<0, 1, 2, 3>>>>, - // Store phi - statement:: For<3, seq_exec, // z - statement::Lambda<4, Segs<0, 2, 3>, - Params<2>, - Offsets<0, 2, 3>> - > - > // m + // Store phi + statement::For< + 3, seq_exec, // z + statement::Lambda< + 4, Segs<0, 2, 3>, Params<2>, + Offsets<0, 2, 3>>>> // m - > // Tile z - > // g + > // Tile z + > // g - > // Tile d - > // Tile m - > // LocalMemory - >; // KernelPolicy + > // Tile d + > // Tile m + > // LocalMemory + >; // KernelPolicy + // + // Define statically dimensioned local arrays used in kernel + // - // - // Define statically dimensioned local arrays used in kernel - // + using shmem_L_t = + RAJA::TypedLocalArray, IM, ID>; + shmem_L_t shmem_L; - using shmem_L_t = RAJA::TypedLocalArray, - IM, ID>; - shmem_L_t shmem_L; + using shmem_psi_t = + RAJA::TypedLocalArray, ID, IG, + IZ>; + shmem_psi_t shmem_psi; - using shmem_psi_t = RAJA::TypedLocalArray, - ID, IG, IZ>; - shmem_psi_t shmem_psi; + using shmem_phi_t = + RAJA::TypedLocalArray, IM, IG, + IZ>; + shmem_phi_t shmem_phi; - using shmem_phi_t = RAJA::TypedLocalArray, - IM, IG, IZ>; - shmem_phi_t shmem_phi; + RAJA::Timer timer; + timer.start(); - RAJA::Timer timer; - timer.start(); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel_param( - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)), + // For kernel_param, second arg is a tuple of data objects used in + // lambdas. They are the last args in all lambdas (after indices). + RAJA::make_tuple(shmem_L, shmem_psi, shmem_phi), - RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)), - // For kernel_param, second arg is a tuple of data objects used in lambdas. - // They are the last args in all lambdas (after indices). 
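As the comments above note, the second tuple passed to RAJA::kernel_param supplies the parameter objects, and each lambda receives the segment indices first, followed by those parameters. A minimal, self-contained sketch of that convention (the view name, sizes, and the scale-by-2 operation are illustrative only, not taken from this benchmark):

#include "RAJA/RAJA.hpp"

// Sketch of the kernel_param calling convention: segment indices arrive
// first in the lambda, then the entries of the parameter tuple.
void scale_matrix(double* a, int nrows, int ncols)
{
  RAJA::View<double, RAJA::Layout<2>> A(a, nrows, ncols);

  using POL = RAJA::KernelPolicy<
      RAJA::statement::For<0, RAJA::seq_exec,        // rows
          RAJA::statement::For<1, RAJA::seq_exec,    // cols
              RAJA::statement::Lambda<0>>>>;

  RAJA::kernel_param<POL>(
      RAJA::make_tuple(RAJA::RangeSegment(0, nrows),
                       RAJA::RangeSegment(0, ncols)),
      RAJA::make_tuple(2.0),  // parameter tuple: a single scale factor
      [=](RAJA::Index_type r, RAJA::Index_type c, double factor)
      { A(r, c) *= factor; });
}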
- RAJA::make_tuple( shmem_L, - shmem_psi, - shmem_phi), - - - // Lambda<0> : Load L into shmem - [=] (IM m, ID d, - shmem_L_t& sh_L, - IM tm, ID td) - { - sh_L(tm, td) = L(m, d); - }, - // Lambda<1> : Load psi into shmem - [=] (ID d, IG g, IZ z, - shmem_psi_t& sh_psi, - ID td, IG tg, IZ tz) - { - sh_psi(td, tg, tz) = psi(d, g, z); - }, + // Lambda<0> : Load L into shmem + [=](IM m, ID d, shmem_L_t& sh_L, IM tm, ID td) + { sh_L(tm, td) = L(m, d); }, - // Lambda<2> : Load phi into shmem - [=] (IM m, IG g, IZ z, - shmem_phi_t& sh_phi, - IM tm, IG tg, IZ tz) - { - sh_phi(tm, tg, tz) = phi(m, g, z); - }, + // Lambda<1> : Load psi into shmem + [=](ID d, IG g, IZ z, shmem_psi_t& sh_psi, ID td, IG tg, IZ tz) + { sh_psi(td, tg, tz) = psi(d, g, z); }, - // Lambda<3> : Compute phi in shmem - [=] (shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, - IM tm, ID td, IG tg, IZ tz) - { - sh_phi(tm, tg, tz) += sh_L(tm, td) * sh_psi(td, tg, tz); - }, + // Lambda<2> : Load phi into shmem + [=](IM m, IG g, IZ z, shmem_phi_t& sh_phi, IM tm, IG tg, IZ tz) + { sh_phi(tm, tg, tz) = phi(m, g, z); }, - // Lambda<4> : Store phi - [=] (IM m, IG g, IZ z, - shmem_phi_t& sh_phi, - IM tm, IG tg, IZ tz) - { - phi(m, g, z) = sh_phi(tm, tg, tz); - } + // Lambda<3> : Compute phi in shmem + [=](shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, IM tm, + ID td, IG tg, IZ tz) + { sh_phi(tm, tg, tz) += sh_L(tm, td) * sh_psi(td, tg, tz); }, - ); + // Lambda<4> : Store phi + [=](IM m, IG g, IZ z, shmem_phi_t& sh_phi, IM tm, IG tg, IZ tz) + { phi(m, g, z) = sh_phi(tm, tg, tz); } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA sequential shmem version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + ); + + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA sequential shmem version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) && (VARIANT_RAJA_OPENMP) -{ - std::cout << "\n Running RAJA OpenMP version of LTimes...\n"; + { + std::cout << "\n Running RAJA OpenMP version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{0, 
1, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{0, 1, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{0, 1, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{0, 1, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); #if 1 - using EXECPOL = - RAJA::KernelPolicy< - statement::For<0, omp_parallel_for_exec, // m - statement::For<1, seq_exec, // d - statement::For<2, seq_exec, // g - statement::For<3, simd_exec, // z - statement::Lambda<0> - > - > - > - > - >; + using EXECPOL = RAJA::KernelPolicy>>>>>; #else - // - // Benefits of using OpenMP collapse depends on compiler, platform, - // relative segment sizes. - // - using EXECPOL = - RAJA::KernelPolicy< - statement::Collapse, // m, g, z - statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - >; + // + // Benefits of using OpenMP collapse depends on compiler, platform, + // relative segment sizes. + // + using EXECPOL = RAJA::KernelPolicy, // m, g, z + statement::For<1, seq_exec, // d + statement::Lambda<0>>>>; #endif - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=](IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA OpenMP version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA OpenMP version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_CUDA_KERNEL -{ - std::cout << "\n Running RAJA CUDA version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - - // - // 
View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::CudaKernelAsync< - statement::For<0, cuda_block_x_loop, // m - statement::For<2, cuda_block_y_loop, // g - statement::For<3, cuda_thread_x_loop, // z - statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA CUDA version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = + RAJA::KernelPolicy>>>>>>; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - cudaErrchk( cudaDeviceSynchronize() ); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=] RAJA_DEVICE(IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * 
psi(d, g, z); }); + cudaErrchk(cudaDeviceSynchronize()); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + cudaErrchk(cudaFree(dL_data)); + cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); + + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif - -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_CUDA_TEAMS -{ - std::cout << "\n Running RAJA CUDA Teams version of LTimes...\n"; + { + std::cout << "\n Running RAJA CUDA Teams version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); - using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + using pol_launch = + RAJA::LaunchPolicy>; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + 
using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch( - RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(160, 1, 1), - RAJA::Threads(8, 64, 1)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) + for (int iter = 0; iter < num_iter; ++iter) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_z), [&](IZ z){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_m), [&](IM m){ - - double acc = phi(m, g, z); - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_d), [&](ID d){ - - acc += L(m, d) * psi(d, g, z); - - - }); - - phi(m,g,z) = acc; + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(160, 1, 1), RAJA::Threads(8, 64, 1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_g), + [&](IG g) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_z), + [&](IZ z) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_m), + [&](IM m) + { + double acc = phi(m, g, z); + + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_d), + [&](ID d) + { + acc += L(m, d) * psi(d, g, z); + }); + + phi(m, g, z) = acc; + }); + }); + }); }); - }); - }); - - }); - - } - cudaErrchk( cudaDeviceSynchronize() ); + } + cudaErrchk(cudaDeviceSynchronize()); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA Teams version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA Teams version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); + cudaErrchk(cudaFree(dL_data)); + cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, 
num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #ifdef __CUDA_ARCH__ #define RAJA_GET_POLICY(POL) typename POL::device_policy_t @@ -1303,732 +1253,711 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if VARIANT_CUDA_TEAMS_MATRIX -{ - std::cout << "\n Running RAJA CUDA Teams+Matrix version of LTimes...\n"; + { + std::cout << "\n Running RAJA CUDA Teams+Matrix version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); - using matrix_layout = RowMajorLayout; + using matrix_layout = RowMajorLayout; - using L_matrix_host_t = RAJA::expt::SquareMatrixRegister; - using L_matrix_device_t = RAJA::expt::RectMatrixRegister; - using L_matrix_hd_t = RAJA::LaunchPolicy; + using L_matrix_host_t = + RAJA::expt::SquareMatrixRegister; + using L_matrix_device_t = + RAJA::expt::RectMatrixRegister; + using L_matrix_hd_t = + RAJA::LaunchPolicy; - using phi_matrix_host_t = RAJA::expt::SquareMatrixRegister; - using phi_matrix_device_t = RAJA::expt::RectMatrixRegister; - using phi_matrix_hd_t = RAJA::LaunchPolicy; + using phi_matrix_host_t = + RAJA::expt::SquareMatrixRegister; + using phi_matrix_device_t = + RAJA::expt::RectMatrixRegister; + using phi_matrix_hd_t = + RAJA::LaunchPolicy; - using psi_matrix_host_t = RAJA::expt::SquareMatrixRegister; - using psi_matrix_device_t = RAJA::expt::RectMatrixRegister; - using psi_matrix_hd_t = RAJA::LaunchPolicy; + using psi_matrix_host_t = + RAJA::expt::SquareMatrixRegister; + using psi_matrix_device_t = + RAJA::expt::RectMatrixRegister; + using psi_matrix_hd_t = + RAJA::LaunchPolicy; - using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; + using pol_launch = + RAJA::LaunchPolicy>; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + 
using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 2, 0}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 2, 0}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 2, 0}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 2, 0}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - auto seg_g = RAJA::TypedRangeSegment(0, num_g); - auto seg_z = RAJA::TypedRangeSegment(0, num_z); - auto seg_m = RAJA::TypedRangeSegment(0, num_m); - auto seg_d = RAJA::TypedRangeSegment(0, num_d); + auto seg_g = RAJA::TypedRangeSegment(0, num_g); + auto seg_z = RAJA::TypedRangeSegment(0, num_z); + auto seg_m = RAJA::TypedRangeSegment(0, num_m); + auto seg_d = RAJA::TypedRangeSegment(0, num_d); - printf("num_iter=%d\n", (int)num_iter); - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch( - RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(num_g, 1, 1), - RAJA::Threads(32, 32, 1)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) + printf("num_iter=%d\n", (int)num_iter); + for (int iter = 0; iter < num_iter; ++iter) { + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(num_g, 1, 1), + RAJA::Threads(32, 32, 1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) + { + using L_matrix_t = RAJA_GET_POLICY(L_matrix_hd_t); + using L_RowM = RAJA::expt::RowIndex; + using L_ColD = RAJA::expt::ColIndex; + + using psi_matrix_t = RAJA_GET_POLICY(psi_matrix_hd_t); + using psi_RowD = RAJA::expt::RowIndex; + using psi_ColZ = RAJA::expt::ColIndex; + + using phi_matrix_t = RAJA_GET_POLICY(phi_matrix_hd_t); + using phi_RowM = RAJA::expt::RowIndex; + using phi_ColZ = RAJA::expt::ColIndex; + + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), + [&](IG g) + { + RAJA::tile( + ctx, 32, + RAJA::TypedRangeSegment(0, num_z), + [&](RAJA::TypedRangeSegment tzi) + { + RAJA::TypedRangeSegment tz( + *tzi.begin(), *tzi.end()); + + phi(phi_RowM::all(), g, phi_ColZ(tz)) += + L(L_RowM::all(), L_ColD::all()) * + psi(psi_RowD::all(), g, psi_ColZ(tz)); + }); + }); + }); + } + cudaErrchk(cudaDeviceSynchronize()); - - using L_matrix_t = RAJA_GET_POLICY(L_matrix_hd_t); - using L_RowM = RAJA::expt::RowIndex; - using L_ColD = RAJA::expt::ColIndex; - - using psi_matrix_t = RAJA_GET_POLICY(psi_matrix_hd_t); - using psi_RowD = RAJA::expt::RowIndex; - using psi_ColZ = RAJA::expt::ColIndex; - - using phi_matrix_t = RAJA_GET_POLICY(phi_matrix_hd_t); - using phi_RowM = RAJA::expt::RowIndex; - using phi_ColZ = RAJA::expt::ColIndex; - - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ - - RAJA::tile(ctx, 32, RAJA::TypedRangeSegment(0, 
num_z), [&](RAJA::TypedRangeSegment tzi){ - - RAJA::TypedRangeSegment tz(*tzi.begin(), *tzi.end()); - - phi(phi_RowM::all(), g, phi_ColZ(tz)) += - L(L_RowM::all(), L_ColD::all()) * psi(psi_RowD::all(), g, psi_ColZ(tz)); - - }); - }); - - }); - - } - cudaErrchk( cudaDeviceSynchronize() ); - - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA Teams+Matrix version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA Teams+Matrix version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); + cudaErrchk(cudaFree(dL_data)); + cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_CUDA_KERNEL_SHMEM -{ - std::cout << "\n Running RAJA CUDA + shmem version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); + { + std::cout << "\n Running RAJA CUDA + shmem version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); + + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 
dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + + static const int tile_m = 25; + static const int tile_d = 90; + static const int tile_g = 0; + static const int tile_z = 40; + + + // + // Define statically dimensioned local arrays used in kernel + // + + using shmem_L_t = + RAJA::TypedLocalArray, IM, ID>; + shmem_L_t shmem_L; + + + using shmem_psi_t = + RAJA::TypedLocalArray, ID, IG, + IZ>; + shmem_psi_t shmem_psi; + + + // + // Define our execution policy + // + + using RAJA::Offsets; + using RAJA::Params; + using RAJA::Segs; + + using EXECPOL = + RAJA::KernelPolicy, + // Tile outer m,d loops + statement::Tile< + 0, tile_fixed, seq_exec, // m + statement::Tile< + 1, tile_fixed, seq_exec, // d + + // Load L for m,d tile into shmem + statement::For<1, cuda_thread_x_loop, // d + statement::For<0, cuda_thread_y_direct, // m + statement::Lambda< + 0, Segs<0, 1>, Params<0>, + Offsets<0, 1>>>>, + statement::CudaSyncThreads, + + // Distribute g, z across blocks and tile z + statement::For< + 2, cuda_block_y_loop, // g + statement::Tile< + 3, tile_fixed, + cuda_block_x_loop, // z + + // Load phi into thread local storage + statement::For< + 3, cuda_thread_x_direct, // z + statement::For< + 0, cuda_thread_y_direct, // m + statement::Lambda<2, Segs<0, 2, 3>, + Params<2>>>>, + + // Load slice of psi into shmem + statement::For< + 3, cuda_thread_x_direct, // z + statement::For<1, cuda_thread_y_loop, // d + // (reusing + // y) + statement::Lambda< + 1, Segs<1, 2, 3>, Params<1>, + Offsets<1, 2, 3>>>>, + statement::CudaSyncThreads, + + // Compute phi + statement::For< + 3, cuda_thread_x_direct, // z + statement::For< + 0, cuda_thread_y_direct, // m + + // Compute thread-local Phi value and store + statement::For< + 1, seq_exec, // d + statement::Lambda< + 3, Segs<0, 1, 2, 3>, + Params<0, 1, 2>, + Offsets<0, 1, 2, 3>>> // d + > // m + >, // z + + // finish tile over directions + statement::CudaSyncThreads, + + // Write out phi from thread local storage + statement::For< + 3, cuda_thread_x_direct, // z + statement::For< + 0, cuda_thread_y_direct, // m + statement::Lambda<4, Segs<0, 2, 3>, + Params<2>>>>, + statement::CudaSyncThreads + + > // Tile z + > // g + + > // Tile d + > // Tile m + > // init shmem + > // CudaKernelAsync + + >; // KernelPolicy - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - - static const int tile_m = 25; - static const int tile_d = 90; - static const int tile_g = 0; - static const int tile_z = 40; - - - - - // - // Define statically dimensioned local arrays used in kernel - // 
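The tile extents above (tile_m, tile_d, tile_z) drive a standard blocking scheme: a tile of L(m, d) is staged in fast memory once and then reused for every z in the z tile. A plain-C++ sketch of that reuse pattern, with hypothetical row-major indexing and the group dimension omitted for brevity:

// Blocking sketch only; the extents and flat indexing are assumptions, and
// the actual kernel expresses this with InitLocalMem/Tile/For statements.
constexpr int TM = 4;
constexpr int TD = 8;

void ltimes_blocked(double* phi, const double* L, const double* psi,
                    int num_m, int num_d, int num_z)
{
  double L_tile[TM][TD];  // stands in for the shared-memory tile

  for (int m0 = 0; m0 < num_m; m0 += TM)
  {
    for (int d0 = 0; d0 < num_d; d0 += TD)
    {
      // Stage the L(m, d) tile once...
      for (int m = 0; m < TM && m0 + m < num_m; ++m)
        for (int d = 0; d < TD && d0 + d < num_d; ++d)
          L_tile[m][d] = L[(m0 + m) * num_d + (d0 + d)];

      // ...then reuse every staged value across the whole z range.
      for (int m = 0; m < TM && m0 + m < num_m; ++m)
        for (int d = 0; d < TD && d0 + d < num_d; ++d)
          for (int z = 0; z < num_z; ++z)
            phi[(m0 + m) * num_z + z] +=
                L_tile[m][d] * psi[(d0 + d) * num_z + z];
    }
  }
}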
- - using shmem_L_t = RAJA::TypedLocalArray, - IM, ID>; - shmem_L_t shmem_L; - - - using shmem_psi_t = RAJA::TypedLocalArray, - ID, IG, IZ>; - shmem_psi_t shmem_psi; - - - - // - // Define our execution policy - // - - using RAJA::Segs; - using RAJA::Params; - using RAJA::Offsets; - - using EXECPOL = - RAJA::KernelPolicy< - statement::CudaKernelAsync< - statement::InitLocalMem, - // Tile outer m,d loops - statement::Tile<0, tile_fixed, seq_exec, // m - statement::Tile<1, tile_fixed, seq_exec, // d - - // Load L for m,d tile into shmem - statement::For<1, cuda_thread_x_loop, // d - statement::For<0, cuda_thread_y_direct, // m - statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> - > - >, - statement::CudaSyncThreads, - - // Distribute g, z across blocks and tile z - statement::For<2, cuda_block_y_loop, // g - statement::Tile<3, tile_fixed, cuda_block_x_loop, // z - - // Load phi into thread local storage - statement::For<3, cuda_thread_x_direct, // z - statement::For<0, cuda_thread_y_direct, // m - statement::Lambda<2, Segs<0,2,3>, Params<2>> - > - >, - - // Load slice of psi into shmem - statement::For<3,cuda_thread_x_direct, // z - statement::For<1, cuda_thread_y_loop, // d (reusing y) - statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> - > - >, - statement::CudaSyncThreads, - - // Compute phi - statement::For<3, cuda_thread_x_direct, // z - statement::For<0, cuda_thread_y_direct, // m - - // Compute thread-local Phi value and store - statement::For<1, seq_exec, // d - statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> - > // d - > // m - >, // z - - // finish tile over directions - statement::CudaSyncThreads, - - // Write out phi from thread local storage - statement::For<3, cuda_thread_x_direct, // z - statement::For<0, cuda_thread_y_direct, // m - statement::Lambda<4, Segs<0,2,3>, Params<2>> - > - >, - statement::CudaSyncThreads + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - > // Tile z - > // g + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)), - > // Tile d - > // Tile m - > // init shmem - > // CudaKernelAsync - - >; // KernelPolicy + // For kernel_param, second arg is a tuple of data objects used in + // lambdas. They are the last args in all lambdas (after indices). + // Here, the last entry '0.0' yields a thread-private temporary for + // computing a phi value, for shared memory before writing to phi + // array. 
+ RAJA::make_tuple(shmem_L, shmem_psi, 0.0), + // Lambda<0> : Load L into shmem + [=] RAJA_DEVICE(IM m, ID d, shmem_L_t & sh_L, IM tm, ID td) + { sh_L(tm, td) = L(m, d); }, + // Lambda<1> : Load slice of psi into shmem + [=] RAJA_DEVICE(ID d, IG g, IZ z, shmem_psi_t & sh_psi, ID td, IG tg, + IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, + // Lambda<2> : Load thread-local phi value + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi_local = phi(m, g, z); }, + // Lambda<3> Compute thread-local phi value + [=] RAJA_DEVICE(IM m, ID d, IG g, IZ z, shmem_L_t & sh_L, + shmem_psi_t & sh_psi, double& phi_local, IM tm, ID td, + IG tg, IZ tz) + { phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); }, - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel_param( - RAJA::make_tuple( - RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)), - - // For kernel_param, second arg is a tuple of data objects used in lambdas. - // They are the last args in all lambdas (after indices). - // Here, the last entry '0.0' yields a thread-private temporary for - // computing a phi value, for shared memory before writing to phi array. - RAJA::make_tuple( shmem_L, - shmem_psi, - 0.0), - - // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, - shmem_L_t& sh_L, - IM tm, ID td) { - sh_L(tm, td) = L(m, d); - }, - - // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (ID d, IG g, IZ z, - shmem_psi_t& sh_psi, - ID td, IG tg, IZ tz) { - - sh_psi(td, tg, tz) = psi(d, g, z); - }, - - // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi_local = phi(m, g, z); - }, - - // Lambda<3> Compute thread-local phi value - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z, - shmem_L_t& sh_L, shmem_psi_t& sh_psi, double& phi_local, - IM tm, ID td, IG tg, IZ tz) { - - phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); - }, - - // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi(m, g, z) = phi_local; - } - - ); + // Lambda<4> : Store phi + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi(m, g, z) = phi_local; } - cudaDeviceSynchronize(); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA + shmem version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + ); + cudaDeviceSynchronize(); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA + shmem version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); -} + cudaErrchk(cudaFree(dL_data)); + 
cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if RAJA_HIP_KERNEL -{ - std::cout << "\n Running RAJA HIP version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - hipErrchk( hipMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dL_data, L_data, L_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::HipKernelAsync< - statement::For<0, hip_block_x_loop, // m - statement::For<2, hip_block_y_loop, // g - statement::For<3, hip_thread_x_loop, // z - statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - hipErrchk( hipDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA HIP version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + hipErrchk(hipMalloc((void**)&dL_data, L_size * sizeof(double))); + hipErrchk(hipMemcpy(dL_data, L_data, L_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + hipErrchk(hipMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dphi_data, phi_size * sizeof(double))); + hipErrchk(hipMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + hipMemcpyHostToDevice)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, 
num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = RAJA::KernelPolicy>>>>>>; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); - hipErrchk( hipDeviceSynchronize() ); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA HIP version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + RAJA::Timer timer; + hipErrchk(hipDeviceSynchronize()); + timer.start(); + + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=] RAJA_DEVICE(IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); - hipErrchk( hipMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - hipMemcpyDeviceToHost ) ); + hipErrchk(hipDeviceSynchronize()); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA HIP version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - hipErrchk( hipFree( dL_data ) ); - hipErrchk( hipFree( dpsi_data ) ); - hipErrchk( hipFree( dphi_data ) ); + hipErrchk(hipMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + hipMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + hipErrchk(hipFree(dL_data)); + hipErrchk(hipFree(dpsi_data)); + hipErrchk(hipFree(dphi_data)); + + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if RAJA_HIP_KERNEL_SHMEM -{ - std::cout << "\n Running RAJA HIP + shmem version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - hipErrchk( hipMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dL_data, L_data, L_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, 
psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - - static const int tile_m = 25; - static const int tile_d = 90; - static const int tile_g = 0; - static const int tile_z = 40; - - - + { + std::cout << "\n Running RAJA HIP + shmem version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + hipErrchk(hipMalloc((void**)&dL_data, L_size * sizeof(double))); + hipErrchk(hipMemcpy(dL_data, L_data, L_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + hipErrchk(hipMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dphi_data, phi_size * sizeof(double))); + hipErrchk(hipMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + hipMemcpyHostToDevice)); + + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + + static const int tile_m = 25; + static const int tile_d = 90; + static const int tile_g = 0; + static const int tile_z = 40; + + + // + // Define statically dimensioned local arrays used in kernel + // + + using shmem_L_t = + RAJA::TypedLocalArray, IM, ID>; + shmem_L_t shmem_L; + + + using shmem_psi_t = + RAJA::TypedLocalArray, ID, IG, + IZ>; + shmem_psi_t shmem_psi; + + + // + // Define our execution policy + // + + using RAJA::Offsets; + using RAJA::Params; + using RAJA::Segs; + using RAJA::statement::Param; + + using EXECPOL = + RAJA::KernelPolicy, + // Tile outer m,d loops + statement::Tile< + 0, tile_fixed, seq_exec, // m + statement::Tile< + 1, tile_fixed, seq_exec, // d + + // Load L for m,d tile into shmem + statement::For<1, hip_thread_x_loop, // d + statement::For<0, hip_thread_y_direct, // m + statement::Lambda< + 0, Segs<0, 1>, Params<0>, + Offsets<0, 1>>>>, + statement::HipSyncThreads, + + // Distribute g, z across blocks and tile z + statement::For< + 2, hip_block_y_loop, // g + statement::Tile< + 3, tile_fixed, + hip_block_x_loop, // z + + // Load phi into thread local storage + statement::For< + 3, hip_thread_x_direct, // z + statement::For< + 0, hip_thread_y_direct, // m + statement::Lambda<2, Segs<0, 2, 3>, + Params<2>>>>, + + // Load slice of psi into shmem + statement::For< + 3, hip_thread_x_direct, // z + statement::For<1, hip_thread_y_loop, // d + // (reusing + // y) + statement::Lambda< + 1, Segs<1, 2, 3>, Params<1>, + Offsets<1, 2, 3>>>>, + statement::HipSyncThreads, + + // Compute phi + statement::For< + 3, hip_thread_x_direct, // z + statement::For< + 0, hip_thread_y_direct, // m + + // Compute thread-local Phi value and store + statement::For< + 1, seq_exec, // d + statement::Lambda< + 3, Segs<0, 1, 2, 3>, + Params<0, 1, 2>, + Offsets<0, 1, 2, 3>>> // d + > // m + >, // z + + // finish tile 
over directions + statement::HipSyncThreads, + + // Write out phi from thread local storage + statement::For< + 3, hip_thread_x_direct, // z + statement::For< + 0, hip_thread_y_direct, // m + statement::Lambda<4, Segs<0, 2, 3>, + Params<2>>>>, + statement::HipSyncThreads + + > // Tile z + > // g + + > // Tile d + > // Tile m + > // init shmem + > // HipKernelAsync + + >; // KernelPolicy - // - // Define statically dimensioned local arrays used in kernel - // - using shmem_L_t = RAJA::TypedLocalArray, - IM, ID>; - shmem_L_t shmem_L; - - - using shmem_psi_t = RAJA::TypedLocalArray, - ID, IG, IZ>; - shmem_psi_t shmem_psi; - - - - // - // Define our execution policy - // - - using RAJA::statement::Param; - using RAJA::Segs; - using RAJA::Params; - using RAJA::Offsets; - - using EXECPOL = - RAJA::KernelPolicy< - statement::HipKernelAsync< - statement::InitLocalMem, - // Tile outer m,d loops - statement::Tile<0, tile_fixed, seq_exec, // m - statement::Tile<1, tile_fixed, seq_exec, // d - - // Load L for m,d tile into shmem - statement::For<1, hip_thread_x_loop, // d - statement::For<0, hip_thread_y_direct, // m - statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> - > - >, - statement::HipSyncThreads, - - // Distribute g, z across blocks and tile z - statement::For<2, hip_block_y_loop, // g - statement::Tile<3, tile_fixed, hip_block_x_loop, // z - - // Load phi into thread local storage - statement::For<3, hip_thread_x_direct, // z - statement::For<0, hip_thread_y_direct, // m - statement::Lambda<2, Segs<0,2,3>, Params<2>> - > - >, - - // Load slice of psi into shmem - statement::For<3, hip_thread_x_direct, // z - statement::For<1, hip_thread_y_loop, // d (reusing y) - statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> - > - >, - statement::HipSyncThreads, - - // Compute phi - statement::For<3, hip_thread_x_direct, // z - statement::For<0, hip_thread_y_direct, // m - - // Compute thread-local Phi value and store - statement::For<1, seq_exec, // d - statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> - > // d - > // m - >, // z - - // finish tile over directions - statement::HipSyncThreads, - - // Write out phi from thread local storage - statement::For<3, hip_thread_x_direct, // z - statement::For<0, hip_thread_y_direct, // m - statement::Lambda<4, Segs<0,2,3>, Params<2>> - > - >, - statement::HipSyncThreads - - > // Tile z - > // g - - > // Tile d - > // Tile m - > // init shmem - > // HipKernelAsync - - >; // KernelPolicy - - - - - RAJA::Timer timer; - hipErrchk( hipDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel_param( - RAJA::make_tuple( - RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)), - - // For kernel_param, second arg is a tuple of data objects used in lambdas. - // They are the last args in all lambdas (after indices). - // Here, the last entry '0.0' yields a thread-private temporary for - // computing a phi value, for shared memory before writing to phi array. 
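Stripped of the policy machinery, the work done by Lambdas 2 through 4 below for a single (m, g, z) triple is a load / accumulate-over-d / store sequence around a thread-private scalar. A sketch with hypothetical flat row-major indexing (the benchmark itself goes through permuted Views):

// Per-(m, g, z) equivalent of Lambda<2> (load), Lambda<3> (accumulate),
// and Lambda<4> (store); the indexing here is assumed for illustration.
inline void ltimes_point(double* phi, const double* L, const double* psi,
                         int m, int g, int z,
                         int num_d, int num_g, int num_z)
{
  double phi_local = phi[(m * num_g + g) * num_z + z];  // Lambda<2>
  for (int d = 0; d < num_d; ++d)                       // Lambda<3>
    phi_local += L[m * num_d + d] * psi[(d * num_g + g) * num_z + z];
  phi[(m * num_g + g) * num_z + z] = phi_local;         // Lambda<4>
}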
- RAJA::make_tuple( shmem_L, - shmem_psi, - 0.0), - - // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, - shmem_L_t& sh_L, - IM tm, ID td) { - sh_L(tm, td) = L(m, d); - }, - - // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (ID d, IG g, IZ z, - shmem_psi_t& sh_psi, - ID td, IG tg, IZ tz) { - - sh_psi(td, tg, tz) = psi(d, g, z); - }, - - // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi_local = phi(m, g, z); - }, - - // Lambda<3> Compute thread-local phi value - [=] RAJA_DEVICE (IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), - IG RAJA_UNUSED_ARG(g), IZ RAJA_UNUSED_ARG(z), - shmem_L_t& sh_L, shmem_psi_t& sh_psi, double& phi_local, - IM tm, ID td, IG tg, IZ tz) { - - phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); - }, - - // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi(m, g, z) = phi_local; - } - - ); - - hipDeviceSynchronize(); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA HIP + shmem version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + RAJA::Timer timer; + hipErrchk(hipDeviceSynchronize()); + timer.start(); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)), + + // For kernel_param, second arg is a tuple of data objects used in + // lambdas. They are the last args in all lambdas (after indices). + // Here, the last entry '0.0' yields a thread-private temporary for + // computing a phi value, for shared memory before writing to phi + // array. 
+ RAJA::make_tuple(shmem_L, shmem_psi, 0.0), + + // Lambda<0> : Load L into shmem + [=] RAJA_DEVICE(IM m, ID d, shmem_L_t & sh_L, IM tm, ID td) + { sh_L(tm, td) = L(m, d); }, + + // Lambda<1> : Load slice of psi into shmem + [=] RAJA_DEVICE(ID d, IG g, IZ z, shmem_psi_t & sh_psi, ID td, IG tg, + IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, + + // Lambda<2> : Load thread-local phi value + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi_local = phi(m, g, z); }, + + // Lambda<3> Compute thread-local phi value + [=] RAJA_DEVICE(IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), + IG RAJA_UNUSED_ARG(g), IZ RAJA_UNUSED_ARG(z), + shmem_L_t & sh_L, shmem_psi_t & sh_psi, + double& phi_local, IM tm, ID td, IG tg, IZ tz) + { phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); }, + + // Lambda<4> : Store phi + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi(m, g, z) = phi_local; } + + ); + + hipDeviceSynchronize(); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA HIP + shmem version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - hipErrchk( hipMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - hipMemcpyDeviceToHost ) ); + hipErrchk(hipMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + hipMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - hipErrchk( hipFree( dL_data ) ); - hipErrchk( hipFree( dpsi_data ) ); - hipErrchk( hipFree( dphi_data ) ); -} + hipErrchk(hipFree(dL_data)); + hipErrchk(hipFree(dpsi_data)); + hipErrchk(hipFree(dphi_data)); + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; @@ -2039,36 +1968,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Function to check result and report P/F. 
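The check recomputes the defining contraction for every entry, phi(m, g, z) = sum over d of L(m, d) * psi(d, g, z), and flags any entry whose absolute difference from the stored phi exceeds 1e-9.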
// template -void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, +void checkResult(PHIVIEW_T& phi, + LVIEW_T& L, + PSIVIEW_T& psi, const int num_m, const int num_d, const int num_g, const int num_z) { - size_t nerrors = 0; + size_t nerrors = 0; double total_error = 0.0; - for (IM m(0); m < num_m; ++m) { - for (IG g(0); g < num_g; ++g) { - for (IZ z(0); z < num_z; ++z) { + for (IM m(0); m < num_m; ++m) + { + for (IG g(0); g < num_g; ++g) + { + for (IZ z(0); z < num_z; ++z) + { double total = 0.0; - for (ID d(0); d < num_d; ++d) { + for (ID d(0); d < num_d; ++d) + { double val = L(m, d) * psi(d, g, z); total += val; } - if (std::abs(total-phi(m, g, z)) > 1e-9) { - printf("ERR: g=%d, z=%d, m=%d, val=%.12e, expected=%.12e\n", - (int)*g, (int)*z, (int)*m, phi(m,g,z), total); + if (std::abs(total - phi(m, g, z)) > 1e-9) + { + printf("ERR: g=%d, z=%d, m=%d, val=%.12e, expected=%.12e\n", (int)*g, + (int)*z, (int)*m, phi(m, g, z), total); ++nerrors; } - total_error += std::abs(total-phi(m, g, z)); + total_error += std::abs(total - phi(m, g, z)); } } } - if ( nerrors == 0 ) { + if (nerrors == 0) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL : " << nerrors << " errors!\n"; } } diff --git a/benchmark/raja_view_blur.cpp b/benchmark/raja_view_blur.cpp index 331d6c51dd..3ed7dcd11d 100644 --- a/benchmark/raja_view_blur.cpp +++ b/benchmark/raja_view_blur.cpp @@ -15,117 +15,107 @@ * */ -//Uncomment to specify variant -//#define RUN_HIP_VARIANT -//#define RUN_CUDA_VARIANT -//#define RUN_SYCL_VARIANT -//#define RUN_OPENMP_VARIANT +// Uncomment to specify variant +// #define RUN_HIP_VARIANT +// #define RUN_CUDA_VARIANT +// #define RUN_SYCL_VARIANT +// #define RUN_OPENMP_VARIANT #define RUN_SEQ_VARIANT -using host_pol = RAJA::seq_exec; +using host_pol = RAJA::seq_exec; using host_resources = RAJA::resources::Host; #if defined(RAJA_ENABLE_HIP) && defined(RUN_HIP_VARIANT) -using device_pol = RAJA::hip_exec<256>; +using device_pol = RAJA::hip_exec<256>; using device_resources = RAJA::resource::Hip; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::HipKernelFixed<256, - RAJA::statement::For<1, RAJA::hip_global_size_y_direct<16>, - RAJA::statement::For<0, RAJA::hip_global_size_x_direct<16>, - RAJA::statement::Lambda<0> - > - > - > - >; +using kernel_pol = RAJA::KernelPolicy, + RAJA::statement::For<0, + RAJA::hip_global_size_x_direct<16>, + RAJA::statement::Lambda<0>>>>>; #endif #if defined(RAJA_ENABLE_CUDA) && defined(RUN_CUDA_VARIANT) -using device_pol = RAJA::cuda_exec<256>; +using device_pol = RAJA::cuda_exec<256>; using device_resources = RAJA::resources::Cuda; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed<256, - RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>, - RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>, - RAJA::statement::Lambda<0> - > - > - > - >; +using kernel_pol = RAJA::KernelPolicy, + RAJA::statement::For<0, + RAJA::cuda_global_size_x_direct<16>, + RAJA::statement::Lambda<0>>>>>; #endif #if defined(RAJA_ENABLE_SYCL) && defined(RUN_SYCL_VARIANT) -using device_pol = RAJA::sycl_exec<256>; +using device_pol = RAJA::sycl_exec<256>; using device_resources = RAJA::resources::Sycl; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::SyclKernel< - RAJA::statement::For<1, RAJA::sycl_global_item_1, - RAJA::statement::For<0, RAJA::sycl_global_item_2, - RAJA::statement::Lambda<0> - > - > - > - >; +using kernel_pol = + RAJA::KernelPolicy>>>>; #endif #if defined(RAJA_ENABLE_OPENMP) 
&& defined(RUN_OPENMP_VARIANT) -using device_pol = RAJA::omp_parallel_for_exec; +using device_pol = RAJA::omp_parallel_for_exec; using device_resources = RAJA::resources::Host; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; +using kernel_pol = RAJA::KernelPolicy>>>; #endif #if defined(RUN_SEQ_VARIANT) -using device_pol = RAJA::seq_exec; -using device_resources = RAJA::resources::Host; - -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; +using device_pol = RAJA::seq_exec; +using device_resources = RAJA::resources::Host; + +using kernel_pol = RAJA::KernelPolicy>>>; #endif -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { const int N = 10000; const int K = 17; - device_resources def_device_res{device_resources::get_default()}; - host_resources def_host_res{host_resources::get_default()}; + device_resources def_device_res {device_resources::get_default()}; + host_resources def_host_res {host_resources::get_default()}; auto timer = RAJA::Timer(); - //launch to intialize the stream - RAJA::forall - (RAJA::RangeSegment(0,1), [=] RAJA_HOST_DEVICE (int i) { - }); + // launch to intialize the stream + RAJA::forall(RAJA::RangeSegment(0, 1), + [=] RAJA_HOST_DEVICE(int i) {}); - int * array = def_host_res.allocate(N * N); - int * array_copy = def_host_res.allocate(N * N); + int* array = def_host_res.allocate(N * N); + int* array_copy = def_host_res.allocate(N * N); - //big array, or image - for (int i = 0; i < N * N; ++i) { - array[i] = 1; + // big array, or image + for (int i = 0; i < N * N; ++i) + { + array[i] = 1; array_copy[i] = 1; } - //small array that acts as the blur - int * kernel = def_host_res.allocate(K * K); - for (int i = 0; i < K * K; ++i) { + // small array that acts as the blur + int* kernel = def_host_res.allocate(K * K); + for (int i = 0; i < K * K; ++i) + { kernel[i] = 2; } @@ -140,7 +130,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int DIM = 2; RAJA::View> array_view(d_array, N, N); - RAJA::View> array_view_copy(d_array_copy, N, N); + RAJA::View> array_view_copy(d_array_copy, N, + N); RAJA::View> kernel_view(d_kernel, K, K); RAJA::RangeSegment range_i(0, N); @@ -148,60 +139,68 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); - RAJA::kernel - (RAJA::make_tuple(range_i, range_j), - [=] RAJA_HOST_DEVICE (int i, int j) { - int sum = 0; - - //looping through the "blur" - for (int m = 0; m < K; ++m) { - for (int n = 0; n < K; ++n) { - int x = i + m; - int y = j + n; - - // adding the "blur" to the "image" wherever the blur is located on the image - if (x < N && y < N) { - sum += kernel_view(m, n) * array_view(x, y); - } - } - } - - array_view(i, j) += sum; - } - ); + RAJA::kernel(RAJA::make_tuple(range_i, range_j), + [=] RAJA_HOST_DEVICE(int i, int j) + { + int sum = 0; + + // looping through the "blur" + for (int m = 0; m < K; ++m) + { + for (int n = 0; n < K; ++n) + { + int x = i + m; + int y = j + n; + + // adding the "blur" to the "image" wherever + // the blur is located on the image + if (x < N && y < N) + { + sum += kernel_view(m, n) * array_view(x, y); + } + } + } + + array_view(i, j) += sum; + }); timer.stop(); - std::cout<<"Elapsed time with RAJA view : "< - 
(RAJA::make_tuple(range_i, range_j), - [=] RAJA_HOST_DEVICE (int i, int j) { - int sum = 0; - - // looping through the "blur" - for (int m = 0; m < K; ++m) { - for (int n = 0; n < K; ++n) { - int x = i + m; - int y = j + n; - - // adding the "blur" to the "image" wherever the blur is located on the image - if (x < N && y < N) { - sum += d_kernel[m * K + n] * d_array_copy[x * N + y]; - } - } - } + timer.reset(); + timer.start(); - d_array_copy[i * N + j] += sum; - } - ); + RAJA::kernel(RAJA::make_tuple(range_i, range_j), + [=] RAJA_HOST_DEVICE(int i, int j) + { + int sum = 0; + + // looping through the "blur" + for (int m = 0; m < K; ++m) + { + for (int n = 0; n < K; ++n) + { + int x = i + m; + int y = j + n; + + // adding the "blur" to the "image" wherever + // the blur is located on the image + if (x < N && y < N) + { + sum += d_kernel[m * K + n] * + d_array_copy[x * N + y]; + } + } + } + + d_array_copy[i * N + j] += sum; + }); timer.stop(); - std::cout<<"Elapsed time with NO RAJA view : "<
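Editor's aside for readers comparing the two blur kernels above: with the default RAJA::Layout, a two-dimensional View is row-major, so array_view(x, y) addresses the same element as the raw d_array[x * N + y] indexing in the second kernel. A small illustrative sketch of that correspondence follows; the function and variable names (view_vs_raw, data, nrows, ncols) are hypothetical, and the default row-major layout is assumed, as in the benchmark.

    #include "RAJA/RAJA.hpp"
    #include <cassert>

    // view(i, j) with a default 2D Layout resolves to data[i * ncols + j].
    void view_vs_raw(int* data, int nrows, int ncols)
    {
      RAJA::View<int, RAJA::Layout<2>> view(data, nrows, ncols);

      // Assumes nrows > 3 and ncols > 7 so the indices are in bounds.
      view(3, 7) = 42;                    // write through the View
      assert(data[3 * ncols + 7] == 42);  // same element via raw indexing
    }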
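The blur benchmark selects its backend entirely through the device_pol, device_resources, and kernel_pol aliases defined earlier in the file. As a hedged sketch of how such aliases are typically consumed (the exact template arguments in the patched file are not reproduced here; run_blur_once and the demo aliases are hypothetical), a policy alias is passed as the template argument of forall or kernel:

    #include "RAJA/RAJA.hpp"

    // Sequential stand-ins for the per-backend aliases defined above.
    using demo_exec_pol   = RAJA::seq_exec;
    using demo_kernel_pol = RAJA::KernelPolicy<
        RAJA::statement::For<1, RAJA::seq_exec,
            RAJA::statement::For<0, RAJA::seq_exec,
                RAJA::statement::Lambda<0>>>>;

    void run_blur_once(int* img, int n)
    {
      // forall takes the execution policy as its template argument.
      RAJA::forall<demo_exec_pol>(RAJA::RangeSegment(0, n * n),
                                  [=](int i) { img[i] += 1; });

      // kernel takes the nested KernelPolicy the same way; lambda arguments
      // follow segment order (i from segment 0, j from segment 1).
      RAJA::kernel<demo_kernel_pol>(
          RAJA::make_tuple(RAJA::RangeSegment(0, n), RAJA::RangeSegment(0, n)),
          [=](int i, int j) { img[j * n + i] += 1; });
    }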