From 506996a65a102cdf77a96d407a701c7602e88929 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 23 Aug 2024 17:00:45 -0700 Subject: [PATCH 01/15] Refactor some hip policies to avoid duplicated code Use a macro to generate the various aliases --- include/RAJA/policy/hip/policy.hpp | 1079 ++++++---------------------- 1 file changed, 233 insertions(+), 846 deletions(-) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 040de50f31..a1adae1488 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1516,890 +1516,277 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< kernel_sync_requirement::none, indexers...>; -/*! - * Maps segment indices to HIP threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical threads to fit all of the direct map requests. - * For example, a segment of size 2000 will not fit, and trigger a runtime - * error. - */ -template < named_dim ... dims > -using hip_thread_direct = hip_indexer_direct< - hip::IndexGlobal...>; - -using hip_thread_x_direct = hip_thread_direct; -using hip_thread_y_direct = hip_thread_direct; -using hip_thread_z_direct = hip_thread_direct; - -using hip_thread_xy_direct = hip_thread_direct; -using hip_thread_xz_direct = hip_thread_direct; -using hip_thread_yx_direct = hip_thread_direct; -using hip_thread_yz_direct = hip_thread_direct; -using hip_thread_zx_direct = hip_thread_direct; -using hip_thread_zy_direct = hip_thread_direct; - -using hip_thread_xyz_direct = hip_thread_direct; -using hip_thread_xzy_direct = hip_thread_direct; -using hip_thread_yxz_direct = hip_thread_direct; -using hip_thread_yzx_direct = hip_thread_direct; -using hip_thread_zxy_direct = hip_thread_direct; -using hip_thread_zyx_direct = hip_thread_direct; -/*! - * Maps segment indices to HIP threads. - * Uses block-stride looping to exceed the maximum number of physical threads - */ -template < named_dim ... 
dims > -using hip_thread_loop = hip_indexer_loop< - hip::IndexGlobal...>; - -template < named_dim ... dims > -using hip_thread_syncable_loop = hip_indexer_syncable_loop< - hip::IndexGlobal...>; - -using hip_thread_x_loop = hip_thread_loop; -using hip_thread_y_loop = hip_thread_loop; -using hip_thread_z_loop = hip_thread_loop; - -using hip_thread_xy_loop = hip_thread_loop; -using hip_thread_xz_loop = hip_thread_loop; -using hip_thread_yx_loop = hip_thread_loop; -using hip_thread_yz_loop = hip_thread_loop; -using hip_thread_zx_loop = hip_thread_loop; -using hip_thread_zy_loop = hip_thread_loop; - -using hip_thread_xyz_loop = hip_thread_loop; -using hip_thread_xzy_loop = hip_thread_loop; -using hip_thread_yxz_loop = hip_thread_loop; -using hip_thread_yzx_loop = hip_thread_loop; -using hip_thread_zxy_loop = hip_thread_loop; -using hip_thread_zyx_loop = hip_thread_loop; +// helper to generate the many policy aliases +#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, scope, mapping) \ + using hip_##flatten##scope##_x_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_y_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_z_##mapping = hip_##flatten##scope##_##mapping; \ + \ + using hip_##flatten##scope##_xy_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_xz_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_yx_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_yz_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_zx_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_zy_##mapping = hip_##flatten##scope##_##mapping; \ + \ + using hip_##flatten##scope##_xyz_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_xzy_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_yxz_##mapping = hip_##flatten##scope##_##mapping; 
\ + using hip_##flatten##scope##_yzx_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_zxy_##mapping = hip_##flatten##scope##_##mapping; \ + using hip_##flatten##scope##_zyx_##mapping = hip_##flatten##scope##_##mapping; + +// helper to generate the many thread policy aliases +#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(flatten, mapping) \ + template < named_dim ... dims > \ + using hip_##flatten##thread_##mapping = hip_##flatten##indexer_##mapping< \ + hip::IndexGlobal...>; \ + RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, thread, mapping) + +// helper to generate the many block policy aliases +#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(flatten, mapping) \ + template < named_dim ... dims > \ + using hip_##flatten##block_##mapping = hip_##flatten##indexer_##mapping< \ + hip::IndexGlobal...>; \ + RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, block, mapping) + +// helper to generate the many global policy aliases +#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten, mapping) \ + template < named_dim ... dims > \ + using hip_##flatten##global_##mapping = hip_##flatten##indexer_##mapping< \ + hip::IndexGlobal...>; \ + RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, global, mapping) -/* - * Maps segment indices to flattened HIP threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical threads to fit all of the direct map requests. - * Reshapes multiple physical threads into a 1D iteration space - */ -template < named_dim ... 
dims > -using hip_flatten_thread_direct = hip_flatten_indexer_direct< - hip::IndexGlobal...>; - -using hip_flatten_thread_x_direct = hip_flatten_thread_direct; -using hip_flatten_thread_y_direct = hip_flatten_thread_direct; -using hip_flatten_thread_z_direct = hip_flatten_thread_direct; - -using hip_flatten_thread_xy_direct = hip_flatten_thread_direct; -using hip_flatten_thread_xz_direct = hip_flatten_thread_direct; -using hip_flatten_thread_yx_direct = hip_flatten_thread_direct; -using hip_flatten_thread_yz_direct = hip_flatten_thread_direct; -using hip_flatten_thread_zx_direct = hip_flatten_thread_direct; -using hip_flatten_thread_zy_direct = hip_flatten_thread_direct; - -using hip_flatten_thread_xyz_direct = hip_flatten_thread_direct; -using hip_flatten_thread_xzy_direct = hip_flatten_thread_direct; -using hip_flatten_thread_yxz_direct = hip_flatten_thread_direct; -using hip_flatten_thread_yzx_direct = hip_flatten_thread_direct; -using hip_flatten_thread_zxy_direct = hip_flatten_thread_direct; -using hip_flatten_thread_zyx_direct = hip_flatten_thread_direct; -/* - * Maps segment indices to flattened HIP threads. - * Reshapes multiple physical threads into a 1D iteration space - * Uses block-stride looping to exceed the maximum number of physical threads +/*! + * Maps segment indices to HIP threads, blocks, or global threads. + * This is a low overhead mapping, but requires that there are enough + * physical threads, blocks, or global threads to fit all of the direct map requests. + * For example, a segment of size 2000 will not fit into 1024 threads, blocks, + * or global threads, and triggers a runtime error in some cases. */ -template < named_dim ... 
dims > -using hip_flatten_thread_loop = hip_flatten_indexer_loop< - hip::IndexGlobal...>; - -using hip_flatten_thread_x_loop = hip_flatten_thread_loop; -using hip_flatten_thread_y_loop = hip_flatten_thread_loop; -using hip_flatten_thread_z_loop = hip_flatten_thread_loop; - -using hip_flatten_thread_xy_loop = hip_flatten_thread_loop; -using hip_flatten_thread_xz_loop = hip_flatten_thread_loop; -using hip_flatten_thread_yx_loop = hip_flatten_thread_loop; -using hip_flatten_thread_yz_loop = hip_flatten_thread_loop; -using hip_flatten_thread_zx_loop = hip_flatten_thread_loop; -using hip_flatten_thread_zy_loop = hip_flatten_thread_loop; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(, direct) -using hip_flatten_thread_xyz_loop = hip_flatten_thread_loop; -using hip_flatten_thread_xzy_loop = hip_flatten_thread_loop; -using hip_flatten_thread_yxz_loop = hip_flatten_thread_loop; -using hip_flatten_thread_yzx_loop = hip_flatten_thread_loop; -using hip_flatten_thread_zxy_loop = hip_flatten_thread_loop; -using hip_flatten_thread_zyx_loop = hip_flatten_thread_loop; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(, direct) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(, direct) /*! - * Maps segment indices to HIP blocks. - * This is the lowest overhead mapping, but requires that there are enough - * physical blocks to fit all of the direct map requests. + * Maps segment indices to HIP threads, blocks, or global threads. + * Uses block-stride or grid-stride looping to exceed the maximum number of + * physical threads, blocks, or global threads. */ -template < named_dim ... 
dims > -using hip_block_direct = hip_indexer_direct< - hip::IndexGlobal...>; - -using hip_block_x_direct = hip_block_direct; -using hip_block_y_direct = hip_block_direct; -using hip_block_z_direct = hip_block_direct; - -using hip_block_xy_direct = hip_block_direct; -using hip_block_xz_direct = hip_block_direct; -using hip_block_yx_direct = hip_block_direct; -using hip_block_yz_direct = hip_block_direct; -using hip_block_zx_direct = hip_block_direct; -using hip_block_zy_direct = hip_block_direct; - -using hip_block_xyz_direct = hip_block_direct; -using hip_block_xzy_direct = hip_block_direct; -using hip_block_yxz_direct = hip_block_direct; -using hip_block_yzx_direct = hip_block_direct; -using hip_block_zxy_direct = hip_block_direct; -using hip_block_zyx_direct = hip_block_direct; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(, loop) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(, loop) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(, loop) /*! - * Maps segment indices to HIP blocks. - * Uses grid-stride looping to exceed the maximum number of blocks + * Only used in the "kernel" abstraction. + * Maps segment indices to HIP threads, blocks, or global threads. + * Uses block-stride or grid-stride looping to exceed the maximum number of + * physical threads, blocks, or global threads. + * Allow synchronization in the loop, do not mask any threads out. */ -template < named_dim ... dims > -using hip_block_loop = hip_indexer_loop< - hip::IndexGlobal...>; - -template < named_dim ... 
dims > -using hip_block_syncable_loop = hip_indexer_syncable_loop< - hip::IndexGlobal...>; - -using hip_block_x_loop = hip_block_loop; -using hip_block_y_loop = hip_block_loop; -using hip_block_z_loop = hip_block_loop; - -using hip_block_xy_loop = hip_block_loop; -using hip_block_xz_loop = hip_block_loop; -using hip_block_yx_loop = hip_block_loop; -using hip_block_yz_loop = hip_block_loop; -using hip_block_zx_loop = hip_block_loop; -using hip_block_zy_loop = hip_block_loop; - -using hip_block_xyz_loop = hip_block_loop; -using hip_block_xzy_loop = hip_block_loop; -using hip_block_yxz_loop = hip_block_loop; -using hip_block_yzx_loop = hip_block_loop; -using hip_block_zxy_loop = hip_block_loop; -using hip_block_zyx_loop = hip_block_loop; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(, syncable_loop) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(, syncable_loop) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(, syncable_loop) -/* - * Maps segment indices to flattened HIP blocks. - * This is the lowest overhead mapping, but requires that there are enough - * physical blocks to fit all of the direct map requests. - * Reshapes multiple physical blocks into a 1D iteration space - */ -template < named_dim ... 
dims > -using hip_flatten_block_direct = hip_flatten_indexer_direct< - hip::IndexGlobal...>; - -using hip_flatten_block_x_direct = hip_flatten_block_direct; -using hip_flatten_block_y_direct = hip_flatten_block_direct; -using hip_flatten_block_z_direct = hip_flatten_block_direct; - -using hip_flatten_block_xy_direct = hip_flatten_block_direct; -using hip_flatten_block_xz_direct = hip_flatten_block_direct; -using hip_flatten_block_yx_direct = hip_flatten_block_direct; -using hip_flatten_block_yz_direct = hip_flatten_block_direct; -using hip_flatten_block_zx_direct = hip_flatten_block_direct; -using hip_flatten_block_zy_direct = hip_flatten_block_direct; - -using hip_flatten_block_xyz_direct = hip_flatten_block_direct; -using hip_flatten_block_xzy_direct = hip_flatten_block_direct; -using hip_flatten_block_yxz_direct = hip_flatten_block_direct; -using hip_flatten_block_yzx_direct = hip_flatten_block_direct; -using hip_flatten_block_zxy_direct = hip_flatten_block_direct; -using hip_flatten_block_zyx_direct = hip_flatten_block_direct; /* - * Maps segment indices to flattened HIP blocks. - * Reshapes multiple physical blocks into a 1D iteration space - * Uses block-stride looping to exceed the maximum number of physical blocks + * Maps segment indices to flattened HIP threads, blocks, or global threads. + * This is a low overhead mapping, but requires that there are enough + * physical threads, blocks, or global threads to fit all of the direct map + * requests. + * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space */ -template < named_dim ... 
dims > -using hip_flatten_block_loop = hip_flatten_indexer_loop< - hip::IndexGlobal...>; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(flatten_, direct) -using hip_flatten_block_x_loop = hip_flatten_block_loop; -using hip_flatten_block_y_loop = hip_flatten_block_loop; -using hip_flatten_block_z_loop = hip_flatten_block_loop; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, direct) -using hip_flatten_block_xy_loop = hip_flatten_block_loop; -using hip_flatten_block_xz_loop = hip_flatten_block_loop; -using hip_flatten_block_yx_loop = hip_flatten_block_loop; -using hip_flatten_block_yz_loop = hip_flatten_block_loop; -using hip_flatten_block_zx_loop = hip_flatten_block_loop; -using hip_flatten_block_zy_loop = hip_flatten_block_loop; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, direct) -using hip_flatten_block_xyz_loop = hip_flatten_block_loop; -using hip_flatten_block_xzy_loop = hip_flatten_block_loop; -using hip_flatten_block_yxz_loop = hip_flatten_block_loop; -using hip_flatten_block_yzx_loop = hip_flatten_block_loop; -using hip_flatten_block_zxy_loop = hip_flatten_block_loop; -using hip_flatten_block_zyx_loop = hip_flatten_block_loop; +/* + * Maps segment indices to flattened HIP threads, blocks, or global threads. 
+ * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space + * Uses block-stride or grid-stride looping to exceed the maximum number of + * physical threads, blocks, or global threads + */ +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(flatten_, loop) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, loop) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, loop) + + +// helper to generate the many one size policy aliases +#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, scope, mapping) \ + template < int X_SIZE > \ + using hip_##flatten##scope##_size_x_##mapping = hip_##flatten##indexer_##mapping>; \ + template < int Y_SIZE > \ + using hip_##flatten##scope##_size_y_##mapping = hip_##flatten##indexer_##mapping>; \ + template < int Z_SIZE > \ + using hip_##flatten##scope##_size_z_##mapping = hip_##flatten##indexer_##mapping>; \ + \ + template < int X_SIZE, int Y_SIZE > \ + using hip_##flatten##scope##_size_xy_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_y>; \ + template < int X_SIZE, int Z_SIZE > \ + using hip_##flatten##scope##_size_xz_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_z>; \ + template < int Y_SIZE, int X_SIZE > \ + using hip_##flatten##scope##_size_yx_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_x>; \ + template < int Y_SIZE, int Z_SIZE > \ + using hip_##flatten##scope##_size_yz_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_z>; \ + template < int Z_SIZE, int X_SIZE > \ + using hip_##flatten##scope##_size_zx_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_x>; \ + template < int Z_SIZE, int Y_SIZE > \ + using hip_##flatten##scope##_size_zy_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_y>; \ + \ + template < int X_SIZE, int Y_SIZE, int Z_SIZE > \ + using hip_##flatten##scope##_size_xyz_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_y, hip::scope##_z>; \ + template < int X_SIZE, int 
Z_SIZE, int Y_SIZE > \ + using hip_##flatten##scope##_size_xzy_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_z, hip::scope##_y>; \ + template < int Y_SIZE, int X_SIZE, int Z_SIZE > \ + using hip_##flatten##scope##_size_yxz_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_x, hip::scope##_z>; \ + template < int Y_SIZE, int Z_SIZE, int X_SIZE > \ + using hip_##flatten##scope##_size_yzx_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_z, hip::scope##_x>; \ + template < int Z_SIZE, int X_SIZE, int Y_SIZE > \ + using hip_##flatten##scope##_size_zxy_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_x, hip::scope##_y>; \ + template < int Z_SIZE, int Y_SIZE, int X_SIZE > \ + using hip_##flatten##scope##_size_zyx_##mapping = hip_##flatten##indexer_##mapping, hip::scope##_y, hip::scope##_x>; + +// helper to generate the many two size policy aliases +#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, scope, mapping) \ + template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_x_##mapping = hip_##flatten##indexer_##mapping>; \ + template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_y_##mapping = hip_##flatten##indexer_##mapping>; \ + template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_z_##mapping = hip_##flatten##indexer_##mapping>; \ + \ + template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, \ + int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_xy_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_y>; \ + template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, \ + int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_xz_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_z>; \ + template 
< int Y_BLOCK_SIZE, int X_BLOCK_SIZE, \ + int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_yx_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_x>; \ + template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, \ + int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_yz_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_z>; \ + template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, \ + int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_zx_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_x>; \ + template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, \ + int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_zy_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_y>; \ + \ + template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, \ + int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_xyz_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_y, \ + hip::scope##_z>; \ + template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, \ + int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_xzy_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_z, \ + hip::scope##_y>; \ + template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, \ + int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_yxz_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_x, \ + 
hip::scope##_z>; \ + template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, \ + int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_yzx_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_z, \ + hip::scope##_x>; \ + template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, \ + int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_zxy_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_x, \ + hip::scope##_y>; \ + template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, \ + int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \ + using hip_##flatten##scope##_size_zyx_##mapping = hip_##flatten##indexer_##mapping, \ + hip::scope##_y, \ + hip::scope##_x>; \ + +// helper to generate the many thread size policy aliases +#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten, mapping) \ + RAJA_INTERNAL_HIP_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, thread, mapping) + +// helper to generate the many block size policy aliases +#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten, mapping) \ + RAJA_INTERNAL_HIP_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, block, mapping) + +// helper to generate the many global size policy aliases +#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten, mapping) \ + RAJA_INTERNAL_HIP_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, global, mapping) /*! - * Maps segment indices to HIP global threads. - * This is the lowest overhead mapping, but requires that there are enough + * Maps segment indices to HIP threads, blocks, or global threads. 
+ * This is a low overhead mapping, but requires that there are enough * physical threads to fit all of the direct map requests. */ -template < named_dim ... dims > -using hip_global_direct = hip_indexer_direct< - hip::IndexGlobal...>; - -using hip_global_x_direct = hip_global_direct; -using hip_global_y_direct = hip_global_direct; -using hip_global_z_direct = hip_global_direct; - -using hip_global_xy_direct = hip_global_direct; -using hip_global_xz_direct = hip_global_direct; -using hip_global_yx_direct = hip_global_direct; -using hip_global_yz_direct = hip_global_direct; -using hip_global_zx_direct = hip_global_direct; -using hip_global_zy_direct = hip_global_direct; - -using hip_global_xyz_direct = hip_global_direct; -using hip_global_xzy_direct = hip_global_direct; -using hip_global_yxz_direct = hip_global_direct; -using hip_global_yzx_direct = hip_global_direct; -using hip_global_zxy_direct = hip_global_direct; -using hip_global_zyx_direct = hip_global_direct; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, direct) -/*! - * Maps segment indices to HIP global threads. - * Uses grid-stride looping to exceed the maximum number of global threads - */ -template < named_dim ... dims > -using hip_global_loop = hip_indexer_loop< - hip::IndexGlobal...>; - -template < named_dim ... 
dims > -using hip_global_syncable_loop = hip_indexer_syncable_loop< - hip::IndexGlobal...>; - -using hip_global_x_loop = hip_global_loop; -using hip_global_y_loop = hip_global_loop; -using hip_global_z_loop = hip_global_loop; - -using hip_global_xy_loop = hip_global_loop; -using hip_global_xz_loop = hip_global_loop; -using hip_global_yx_loop = hip_global_loop; -using hip_global_yz_loop = hip_global_loop; -using hip_global_zx_loop = hip_global_loop; -using hip_global_zy_loop = hip_global_loop; - -using hip_global_xyz_loop = hip_global_loop; -using hip_global_xzy_loop = hip_global_loop; -using hip_global_yxz_loop = hip_global_loop; -using hip_global_yzx_loop = hip_global_loop; -using hip_global_zxy_loop = hip_global_loop; -using hip_global_zyx_loop = hip_global_loop; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, direct) -/* - * Maps segment indices to flattened HIP global threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical global threads to fit all of the direct map requests. - * Reshapes multiple physical global threads into a 1D iteration space - */ -template < named_dim ... 
dims > -using hip_flatten_global_direct = hip_flatten_indexer_direct< - hip::IndexGlobal...>; - -using hip_flatten_global_x_direct = hip_flatten_global_direct; -using hip_flatten_global_y_direct = hip_flatten_global_direct; -using hip_flatten_global_z_direct = hip_flatten_global_direct; - -using hip_flatten_global_xy_direct = hip_flatten_global_direct; -using hip_flatten_global_xz_direct = hip_flatten_global_direct; -using hip_flatten_global_yx_direct = hip_flatten_global_direct; -using hip_flatten_global_yz_direct = hip_flatten_global_direct; -using hip_flatten_global_zx_direct = hip_flatten_global_direct; -using hip_flatten_global_zy_direct = hip_flatten_global_direct; - -using hip_flatten_global_xyz_direct = hip_flatten_global_direct; -using hip_flatten_global_xzy_direct = hip_flatten_global_direct; -using hip_flatten_global_yxz_direct = hip_flatten_global_direct; -using hip_flatten_global_yzx_direct = hip_flatten_global_direct; -using hip_flatten_global_zxy_direct = hip_flatten_global_direct; -using hip_flatten_global_zyx_direct = hip_flatten_global_direct; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, direct) -/* - * Maps segment indices to flattened HIP global threads. - * Reshapes multiple physical global threads into a 1D iteration space - * Uses global thread-stride looping to exceed the maximum number of physical global threads +/*! + * Maps segment indices to HIP threads, blocks, or global threads. + * Uses block-stride or grid-stride looping to exceed the maximum number of + * threads, blocks, or global threads. */ -template < named_dim ... 
dims > -using hip_flatten_global_loop = hip_flatten_indexer_loop< - hip::IndexGlobal...>; - -using hip_flatten_global_x_loop = hip_flatten_global_loop; -using hip_flatten_global_y_loop = hip_flatten_global_loop; -using hip_flatten_global_z_loop = hip_flatten_global_loop; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, loop) -using hip_flatten_global_xy_loop = hip_flatten_global_loop; -using hip_flatten_global_xz_loop = hip_flatten_global_loop; -using hip_flatten_global_yx_loop = hip_flatten_global_loop; -using hip_flatten_global_yz_loop = hip_flatten_global_loop; -using hip_flatten_global_zx_loop = hip_flatten_global_loop; -using hip_flatten_global_zy_loop = hip_flatten_global_loop; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, loop) -using hip_flatten_global_xyz_loop = hip_flatten_global_loop; -using hip_flatten_global_xzy_loop = hip_flatten_global_loop; -using hip_flatten_global_yxz_loop = hip_flatten_global_loop; -using hip_flatten_global_yzx_loop = hip_flatten_global_loop; -using hip_flatten_global_zxy_loop = hip_flatten_global_loop; -using hip_flatten_global_zyx_loop = hip_flatten_global_loop; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, loop) -/*! - * Maps segment indices to HIP global threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical threads to fit all of the direct map requests. +/* + * Maps segment indices to flattened HIP threads, blocks, or global threads. + * This is a low overhead mapping, but requires that there are enough + * physical threads, blocks, or global threads to fit all of the direct map + * requests. + * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space. 
*/ -template < int X_BLOCK_SIZE > -using hip_thread_size_x_direct = hip_indexer_direct>; -template < int Y_BLOCK_SIZE > -using hip_thread_size_y_direct = hip_indexer_direct>; -template < int Z_BLOCK_SIZE > -using hip_thread_size_z_direct = hip_indexer_direct>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_thread_size_xy_direct = hip_indexer_direct, hip::thread_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_thread_size_xz_direct = hip_indexer_direct, hip::thread_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_thread_size_yx_direct = hip_indexer_direct, hip::thread_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_thread_size_yz_direct = hip_indexer_direct, hip::thread_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_thread_size_zx_direct = hip_indexer_direct, hip::thread_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_thread_size_zy_direct = hip_indexer_direct, hip::thread_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_thread_size_xyz_direct = hip_indexer_direct, hip::thread_y, hip::thread_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_thread_size_xzy_direct = hip_indexer_direct, hip::thread_z, hip::thread_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_thread_size_yxz_direct = hip_indexer_direct, hip::thread_x, hip::thread_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_thread_size_yzx_direct = hip_indexer_direct, hip::thread_z, hip::thread_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_thread_size_zxy_direct = hip_indexer_direct, hip::thread_x, hip::thread_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_thread_size_zyx_direct = hip_indexer_direct, hip::thread_y, hip::thread_x>; - - -template < int X_GRID_SIZE > -using hip_block_size_x_direct = hip_indexer_direct>; 
-template < int Y_GRID_SIZE > -using hip_block_size_y_direct = hip_indexer_direct>; -template < int Z_GRID_SIZE > -using hip_block_size_z_direct = hip_indexer_direct>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE > -using hip_block_size_xy_direct = hip_indexer_direct, hip::block_y>; -template < int X_GRID_SIZE, int Z_GRID_SIZE > -using hip_block_size_xz_direct = hip_indexer_direct, hip::block_z>; -template < int Y_GRID_SIZE, int X_GRID_SIZE > -using hip_block_size_yx_direct = hip_indexer_direct, hip::block_x>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE > -using hip_block_size_yz_direct = hip_indexer_direct, hip::block_z>; -template < int Z_GRID_SIZE, int X_GRID_SIZE > -using hip_block_size_zx_direct = hip_indexer_direct, hip::block_x>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE > -using hip_block_size_zy_direct = hip_indexer_direct, hip::block_y>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE > -using hip_block_size_xyz_direct = hip_indexer_direct, hip::block_y, hip::block_z>; -template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE > -using hip_block_size_xzy_direct = hip_indexer_direct, hip::block_z, hip::block_y>; -template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE > -using hip_block_size_yxz_direct = hip_indexer_direct, hip::block_x, hip::block_z>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE > -using hip_block_size_yzx_direct = hip_indexer_direct, hip::block_z, hip::block_x>; -template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE > -using hip_block_size_zxy_direct = hip_indexer_direct, hip::block_x, hip::block_y>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE > -using hip_block_size_zyx_direct = hip_indexer_direct, hip::block_y, hip::block_x>; - - -template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_x_direct = hip_indexer_direct>; -template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_y_direct = 
hip_indexer_direct>; -template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_z_direct = hip_indexer_direct>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_xy_direct = hip_indexer_direct, - hip::global_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_xz_direct = hip_indexer_direct, - hip::global_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_yx_direct = hip_indexer_direct, - hip::global_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_yz_direct = hip_indexer_direct, - hip::global_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_zx_direct = hip_indexer_direct, - hip::global_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_zy_direct = hip_indexer_direct, - hip::global_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_xyz_direct = hip_indexer_direct, - hip::global_y, - hip::global_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_xzy_direct = hip_indexer_direct, - hip::global_z, - 
hip::global_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_yxz_direct = hip_indexer_direct, - hip::global_x, - hip::global_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_yzx_direct = hip_indexer_direct, - hip::global_z, - hip::global_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_zxy_direct = hip_indexer_direct, - hip::global_x, - hip::global_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_zyx_direct = hip_indexer_direct, - hip::global_y, - hip::global_x>; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, direct) -/*! - * Maps segment indices to HIP global threads. 
- * Uses grid-stride looping to exceed the maximum number of global threads - */ -template < int X_BLOCK_SIZE > -using hip_thread_size_x_loop = hip_indexer_loop>; -template < int Y_BLOCK_SIZE > -using hip_thread_size_y_loop = hip_indexer_loop>; -template < int Z_BLOCK_SIZE > -using hip_thread_size_z_loop = hip_indexer_loop>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_thread_size_xy_loop = hip_indexer_loop, hip::thread_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_thread_size_xz_loop = hip_indexer_loop, hip::thread_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_thread_size_yx_loop = hip_indexer_loop, hip::thread_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_thread_size_yz_loop = hip_indexer_loop, hip::thread_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_thread_size_zx_loop = hip_indexer_loop, hip::thread_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_thread_size_zy_loop = hip_indexer_loop, hip::thread_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_thread_size_xyz_loop = hip_indexer_loop, hip::thread_y, hip::thread_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_thread_size_xzy_loop = hip_indexer_loop, hip::thread_z, hip::thread_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_thread_size_yxz_loop = hip_indexer_loop, hip::thread_x, hip::thread_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_thread_size_yzx_loop = hip_indexer_loop, hip::thread_z, hip::thread_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_thread_size_zxy_loop = hip_indexer_loop, hip::thread_x, hip::thread_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_thread_size_zyx_loop = hip_indexer_loop, hip::thread_y, hip::thread_x>; - - -template < int X_GRID_SIZE > -using hip_block_size_x_loop = 
hip_indexer_loop>; -template < int Y_GRID_SIZE > -using hip_block_size_y_loop = hip_indexer_loop>; -template < int Z_GRID_SIZE > -using hip_block_size_z_loop = hip_indexer_loop>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE > -using hip_block_size_xy_loop = hip_indexer_loop, hip::block_y>; -template < int X_GRID_SIZE, int Z_GRID_SIZE > -using hip_block_size_xz_loop = hip_indexer_loop, hip::block_z>; -template < int Y_GRID_SIZE, int X_GRID_SIZE > -using hip_block_size_yx_loop = hip_indexer_loop, hip::block_x>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE > -using hip_block_size_yz_loop = hip_indexer_loop, hip::block_z>; -template < int Z_GRID_SIZE, int X_GRID_SIZE > -using hip_block_size_zx_loop = hip_indexer_loop, hip::block_x>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE > -using hip_block_size_zy_loop = hip_indexer_loop, hip::block_y>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE > -using hip_block_size_xyz_loop = hip_indexer_loop, hip::block_y, hip::block_z>; -template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE > -using hip_block_size_xzy_loop = hip_indexer_loop, hip::block_z, hip::block_y>; -template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE > -using hip_block_size_yxz_loop = hip_indexer_loop, hip::block_x, hip::block_z>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE > -using hip_block_size_yzx_loop = hip_indexer_loop, hip::block_z, hip::block_x>; -template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE > -using hip_block_size_zxy_loop = hip_indexer_loop, hip::block_x, hip::block_y>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE > -using hip_block_size_zyx_loop = hip_indexer_loop, hip::block_y, hip::block_x>; - - -template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_x_loop = hip_indexer_loop>; -template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_y_loop = hip_indexer_loop>; -template < int 
Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_z_loop = hip_indexer_loop>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_xy_loop = hip_indexer_loop, - hip::global_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_xz_loop = hip_indexer_loop, - hip::global_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_yx_loop = hip_indexer_loop, - hip::global_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_yz_loop = hip_indexer_loop, - hip::global_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_zx_loop = hip_indexer_loop, - hip::global_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_zy_loop = hip_indexer_loop, - hip::global_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_xyz_loop = hip_indexer_loop, - hip::global_y, - hip::global_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_xzy_loop = hip_indexer_loop, - hip::global_z, - hip::global_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int 
Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_global_size_yxz_loop = hip_indexer_loop, - hip::global_x, - hip::global_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_yzx_loop = hip_indexer_loop, - hip::global_z, - hip::global_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_global_size_zxy_loop = hip_indexer_loop, - hip::global_x, - hip::global_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_global_size_zyx_loop = hip_indexer_loop, - hip::global_y, - hip::global_x>; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, direct) -/* - * Maps segment indices to flattened HIP global threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical global threads to fit all of the direct map requests. 
- * Reshapes multiple physical global threads into a 1D iteration space - */ -template < int X_BLOCK_SIZE > -using hip_flatten_thread_size_x_direct = hip_flatten_indexer_direct>; -template < int Y_BLOCK_SIZE > -using hip_flatten_thread_size_y_direct = hip_flatten_indexer_direct>; -template < int Z_BLOCK_SIZE > -using hip_flatten_thread_size_z_direct = hip_flatten_indexer_direct>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_flatten_thread_size_xy_direct = hip_flatten_indexer_direct, hip::thread_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_flatten_thread_size_xz_direct = hip_flatten_indexer_direct, hip::thread_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_flatten_thread_size_yx_direct = hip_flatten_indexer_direct, hip::thread_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_flatten_thread_size_yz_direct = hip_flatten_indexer_direct, hip::thread_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_flatten_thread_size_zx_direct = hip_flatten_indexer_direct, hip::thread_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_flatten_thread_size_zy_direct = hip_flatten_indexer_direct, hip::thread_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_flatten_thread_size_xyz_direct = hip_flatten_indexer_direct, hip::thread_y, hip::thread_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_flatten_thread_size_xzy_direct = hip_flatten_indexer_direct, hip::thread_z, hip::thread_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_flatten_thread_size_yxz_direct = hip_flatten_indexer_direct, hip::thread_x, hip::thread_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_flatten_thread_size_yzx_direct = hip_flatten_indexer_direct, hip::thread_z, hip::thread_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_flatten_thread_size_zxy_direct = 
hip_flatten_indexer_direct, hip::thread_x, hip::thread_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_flatten_thread_size_zyx_direct = hip_flatten_indexer_direct, hip::thread_y, hip::thread_x>; - - -template < int X_GRID_SIZE > -using hip_flatten_block_size_x_direct = hip_flatten_indexer_direct>; -template < int Y_GRID_SIZE > -using hip_flatten_block_size_y_direct = hip_flatten_indexer_direct>; -template < int Z_GRID_SIZE > -using hip_flatten_block_size_z_direct = hip_flatten_indexer_direct>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE > -using hip_flatten_block_size_xy_direct = hip_flatten_indexer_direct, hip::block_y>; -template < int X_GRID_SIZE, int Z_GRID_SIZE > -using hip_flatten_block_size_xz_direct = hip_flatten_indexer_direct, hip::block_z>; -template < int Y_GRID_SIZE, int X_GRID_SIZE > -using hip_flatten_block_size_yx_direct = hip_flatten_indexer_direct, hip::block_x>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE > -using hip_flatten_block_size_yz_direct = hip_flatten_indexer_direct, hip::block_z>; -template < int Z_GRID_SIZE, int X_GRID_SIZE > -using hip_flatten_block_size_zx_direct = hip_flatten_indexer_direct, hip::block_x>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE > -using hip_flatten_block_size_zy_direct = hip_flatten_indexer_direct, hip::block_y>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE > -using hip_flatten_block_size_xyz_direct = hip_flatten_indexer_direct, hip::block_y, hip::block_z>; -template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE > -using hip_flatten_block_size_xzy_direct = hip_flatten_indexer_direct, hip::block_z, hip::block_y>; -template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE > -using hip_flatten_block_size_yxz_direct = hip_flatten_indexer_direct, hip::block_x, hip::block_z>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE > -using hip_flatten_block_size_yzx_direct = hip_flatten_indexer_direct, hip::block_z, hip::block_x>; -template < 
int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE > -using hip_flatten_block_size_zxy_direct = hip_flatten_indexer_direct, hip::block_x, hip::block_y>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE > -using hip_flatten_block_size_zyx_direct = hip_flatten_indexer_direct, hip::block_y, hip::block_x>; - - -template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_x_direct = hip_flatten_indexer_direct>; -template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_y_direct = hip_flatten_indexer_direct>; -template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_z_direct = hip_flatten_indexer_direct>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_xy_direct = hip_flatten_indexer_direct, - hip::global_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_xz_direct = hip_flatten_indexer_direct, - hip::global_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_yx_direct = hip_flatten_indexer_direct, - hip::global_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_yz_direct = hip_flatten_indexer_direct, - hip::global_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_zx_direct = hip_flatten_indexer_direct, - hip::global_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int 
Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_zy_direct = hip_flatten_indexer_direct, - hip::global_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_xyz_direct = hip_flatten_indexer_direct, - hip::global_y, - hip::global_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_xzy_direct = hip_flatten_indexer_direct, - hip::global_z, - hip::global_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_yxz_direct = hip_flatten_indexer_direct, - hip::global_x, - hip::global_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_yzx_direct = hip_flatten_indexer_direct, - hip::global_z, - hip::global_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_zxy_direct = hip_flatten_indexer_direct, - hip::global_x, - hip::global_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_zyx_direct = hip_flatten_indexer_direct, - hip::global_y, - hip::global_x>; 
+RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, direct) /* - * Maps segment indices to flattened HIP global threads. - * Reshapes multiple physical global threads into a 1D iteration space - * Uses global thread-stride looping to exceed the maximum number of physical global threads + * Maps segment indices to flattened HIP threads, blocks, or global threads. + * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space. + * Uses block-stride or grid-stride looping to exceed the maximum number of + * physical threads, blocks, or global threads. */ -template < int X_BLOCK_SIZE > -using hip_flatten_thread_size_x_loop = hip_flatten_indexer_loop>; -template < int Y_BLOCK_SIZE > -using hip_flatten_thread_size_y_loop = hip_flatten_indexer_loop>; -template < int Z_BLOCK_SIZE > -using hip_flatten_thread_size_z_loop = hip_flatten_indexer_loop>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_flatten_thread_size_xy_loop = hip_flatten_indexer_loop, hip::thread_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_flatten_thread_size_xz_loop = hip_flatten_indexer_loop, hip::thread_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_flatten_thread_size_yx_loop = hip_flatten_indexer_loop, hip::thread_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_flatten_thread_size_yz_loop = hip_flatten_indexer_loop, hip::thread_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_flatten_thread_size_zx_loop = hip_flatten_indexer_loop, hip::thread_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_flatten_thread_size_zy_loop = hip_flatten_indexer_loop, hip::thread_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_flatten_thread_size_xyz_loop = hip_flatten_indexer_loop, hip::thread_y, hip::thread_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_flatten_thread_size_xzy_loop = hip_flatten_indexer_loop, 
hip::thread_z, hip::thread_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using hip_flatten_thread_size_yxz_loop = hip_flatten_indexer_loop, hip::thread_x, hip::thread_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_flatten_thread_size_yzx_loop = hip_flatten_indexer_loop, hip::thread_z, hip::thread_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using hip_flatten_thread_size_zxy_loop = hip_flatten_indexer_loop, hip::thread_x, hip::thread_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using hip_flatten_thread_size_zyx_loop = hip_flatten_indexer_loop, hip::thread_y, hip::thread_x>; - - -template < int X_GRID_SIZE > -using hip_flatten_block_size_x_loop = hip_flatten_indexer_loop>; -template < int Y_GRID_SIZE > -using hip_flatten_block_size_y_loop = hip_flatten_indexer_loop>; -template < int Z_GRID_SIZE > -using hip_flatten_block_size_z_loop = hip_flatten_indexer_loop>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE > -using hip_flatten_block_size_xy_loop = hip_flatten_indexer_loop, hip::block_y>; -template < int X_GRID_SIZE, int Z_GRID_SIZE > -using hip_flatten_block_size_xz_loop = hip_flatten_indexer_loop, hip::block_z>; -template < int Y_GRID_SIZE, int X_GRID_SIZE > -using hip_flatten_block_size_yx_loop = hip_flatten_indexer_loop, hip::block_x>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE > -using hip_flatten_block_size_yz_loop = hip_flatten_indexer_loop, hip::block_z>; -template < int Z_GRID_SIZE, int X_GRID_SIZE > -using hip_flatten_block_size_zx_loop = hip_flatten_indexer_loop, hip::block_x>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE > -using hip_flatten_block_size_zy_loop = hip_flatten_indexer_loop, hip::block_y>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE > -using hip_flatten_block_size_xyz_loop = hip_flatten_indexer_loop, hip::block_y, hip::block_z>; -template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE > -using 
hip_flatten_block_size_xzy_loop = hip_flatten_indexer_loop, hip::block_z, hip::block_y>; -template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE > -using hip_flatten_block_size_yxz_loop = hip_flatten_indexer_loop, hip::block_x, hip::block_z>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE > -using hip_flatten_block_size_yzx_loop = hip_flatten_indexer_loop, hip::block_z, hip::block_x>; -template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE > -using hip_flatten_block_size_zxy_loop = hip_flatten_indexer_loop, hip::block_x, hip::block_y>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE > -using hip_flatten_block_size_zyx_loop = hip_flatten_indexer_loop, hip::block_y, hip::block_x>; - - -template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_x_loop = hip_flatten_indexer_loop>; -template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_y_loop = hip_flatten_indexer_loop>; -template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_z_loop = hip_flatten_indexer_loop>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_xy_loop = hip_flatten_indexer_loop, - hip::global_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_xz_loop = hip_flatten_indexer_loop, - hip::global_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_yx_loop = hip_flatten_indexer_loop, - hip::global_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using 
hip_flatten_global_size_yz_loop = hip_flatten_indexer_loop, - hip::global_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_zx_loop = hip_flatten_indexer_loop, - hip::global_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_zy_loop = hip_flatten_indexer_loop, - hip::global_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_xyz_loop = hip_flatten_indexer_loop, - hip::global_y, - hip::global_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_xzy_loop = hip_flatten_indexer_loop, - hip::global_z, - hip::global_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_yxz_loop = hip_flatten_indexer_loop, - hip::global_x, - hip::global_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_yzx_loop = hip_flatten_indexer_loop, - hip::global_z, - hip::global_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_zxy_loop = 
hip_flatten_indexer_loop, - hip::global_x, - hip::global_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using hip_flatten_global_size_zyx_loop = hip_flatten_indexer_loop, - hip::global_y, - hip::global_x>; +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, loop) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, loop) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, loop) /* From f08fddc5dac379a520dea9a15f5790d6da0cef46 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 23 Aug 2024 17:03:33 -0700 Subject: [PATCH 02/15] Add an unchecked iteration_mapping This iteration mapping assumes that the number of iterations is the same as the size of the range and does no checking. This is useful when mapping gpu blocks as we often launch the exact number we need and don't need to check if we are in range. This can give ~5% speedup vs direct in this case. --- include/RAJA/util/types.hpp | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 310217bde5..53f30fe4cb 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -70,6 +70,7 @@ enum struct kernel_sync_requirement : int namespace iteration_mapping { +struct UncheckedBase {}; struct DirectBase {}; struct LoopBase {}; struct ContiguousLoopBase : LoopBase {}; @@ -82,6 +83,24 @@ struct SizedLoopSpecifyingBase : SizedLoopBase static constexpr size_t max_iterations = t_max_iterations; }; +/// +/// Unchecked assumes the loop has the same number of iterations and indices and +/// maps directly without bounds checking from an iteration to an index. +/// +/// For example a loop with 4 iterations mapping indices from a range of size 4. 
+/// int iterations = 4; +/// int range_size = 4; +/// for (int i = 0; i < iterations; ++i) { +/// int index = i; +/// printf("%i -> {%i}", i, index); +/// } +/// // 0 -> {0} +/// // 1 -> {1} +/// // 2 -> {2} +/// // 3 -> {3} +/// +struct Unchecked : UncheckedBase {}; + /// /// Direct assumes the loop has enough iterations for all of the indices and /// maps directly from an iteration to an index. @@ -94,14 +113,14 @@ struct SizedLoopSpecifyingBase : SizedLoopBase /// int index = i; /// printf("%i -> {%i}", i, index); /// } else { -/// printf("%i -> {}", i); +/// printf("%i -> {safely-ignored}", i); /// } /// } /// // 0 -> {0} /// // 1 -> {1} /// // 2 -> {2} /// // 3 -> {3} -/// // 4 -> {} +/// // 4 -> {safely-ignored} /// struct Direct : DirectBase {}; From dafde16d62a05370bb917f549bfa894b7d4879ee Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 23 Aug 2024 17:03:58 -0700 Subject: [PATCH 03/15] Add hip unchecked policies --- include/RAJA/policy/hip/policy.hpp | 62 ++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index a1adae1488..a78b5838c0 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1486,6 +1486,12 @@ using policy::hip::hip_launch_t; // policies usable with kernel and launch +template < typename ... indexers > +using hip_indexer_unchecked = policy::hip::hip_indexer< + iteration_mapping::Unchecked, + kernel_sync_requirement::none, + indexers...>; + template < typename ... indexers > using hip_indexer_direct = policy::hip::hip_indexer< iteration_mapping::Direct, @@ -1504,6 +1510,12 @@ using hip_indexer_syncable_loop = policy::hip::hip_indexer< kernel_sync_requirement::sync, indexers...>; +template < typename ... indexers > +using hip_flatten_indexer_unchecked = policy::hip::hip_flatten_indexer< + iteration_mapping::Unchecked, + kernel_sync_requirement::none, + indexers...>; + template < typename ... 
indexers > using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer< iteration_mapping::Direct, @@ -1559,6 +1571,19 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, global, mapping) +/*! + * Maps segment indices to HIP threads, blocks, or global threads. + * This is the lowest overhead mapping, but requires that there are the same + * number of physical threads, blocks, or global threads as map requests. + * For example, a segment of size 1000 will only fit into 1000 threads, blocks, or global threads, and + * triggers a runtime error in some cases. + */ +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(, unchecked) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(, unchecked) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(, unchecked) + /*! * Maps segment indices to HIP threads, blocks, or global threads. * This is a low overhead mapping, but requires that there are enough @@ -1597,6 +1622,19 @@ RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(, syncable_loop) RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(, syncable_loop) +/* + * Maps segment indices to flattened HIP threads, blocks, or global threads. + * This is the lowest overhead mapping, but requires that there are the same + * number of physical threads, blocks, or global threads as map requests. + * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space + */ +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(flatten_, unchecked) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, unchecked) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, unchecked) + /* * Maps segment indices to flattened HIP threads, blocks, or global threads. 
* This is a low overhead mapping, but requires that there are enough @@ -1738,6 +1776,17 @@ RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, loop) RAJA_INTERNAL_HIP_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, global, mapping) +/*! + * Maps segment indices to HIP threads, blocks, or global threads. + * This is the lowest overhead mapping, but requires that there are the same + * number of physical threads as the map requests. + */ +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, unchecked) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, unchecked) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, unchecked) + /*! * Maps segment indices to HIP threads, blocks, or global threads. * This is a low overhead mapping, but requires that there are enough @@ -1761,6 +1810,19 @@ RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, loop) RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, loop) +/* + * Maps segment indices to flattened HIP threads, blocks, or global threads. + * This is the lowest overhead mapping, but requires that there are the same + * number of physical threads, blocks, or global threads as the map requests. + * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space. + */ +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, unchecked) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, unchecked) + +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, unchecked) + /* * Maps segment indices to flattened HIP threads, blocks, or global threads. 
* This is a low overhead mapping, but requires that there are enough From b810592d1de74a6efc9dbca403bc8d2dbf450291 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 23 Aug 2024 17:04:38 -0700 Subject: [PATCH 04/15] Add hip launch unchecked implementations --- include/RAJA/policy/hip/launch.hpp | 261 +++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 6823647b48..e1cbd87497 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -358,6 +358,77 @@ struct LaunchExecute> { /* HIP generic loop implementations */ +template +struct LoopExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE + void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment, + BODY const &body) + { + const diff_t i = IndexMapper::template index(); + + body(*(segment.begin() + i)); + } +}; + +template +struct LoopExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index(); + const diff_t i1 = IndexMapper1::template index(); + + body(*(segment0.begin() + i0), *(segment1.begin() + i1)); + } +}; + +template +struct LoopExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index(); + const diff_t i1 = IndexMapper1::template index(); + const diff_t i2 = IndexMapper2::template index(); + + body(*(segment0.begin() + i0), + 
*(segment1.begin() + i1), + *(segment2.begin() + i2)); + } +}; + template struct LoopExecute +struct LoopICountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment, + BODY const &body) + { + const diff_t i = IndexMapper::template index(); + + body(*(segment.begin() + i), i); + } +}; +template +struct LoopICountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index(); + const diff_t i1 = IndexMapper1::template index(); + + body(*(segment0.begin() + i0), + *(segment1.begin() + i1), + i0, i1); + } +}; + +template +struct LoopICountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index(); + const diff_t i1 = IndexMapper1::template index(); + const diff_t i2 = IndexMapper2::template index(); + + body(*(segment0.begin() + i0), + *(segment1.begin() + i1), + *(segment2.begin() + i2), + i0, i1, i2); + } +}; + template struct LoopICountExecute +struct LoopExecute, + SEGMENT> + : LoopExecute, + SEGMENT> +{}; + +template +struct LoopExecute, + SEGMENT> +{ + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment, + BODY const &body) + { + const int i0 = IndexMapper0::template index(); + const int i1 = IndexMapper1::template index(); + + const 
diff_t i0_stride = IndexMapper0::template size(); + + const int i = i0 + i0_stride*i1; + + body(*(segment.begin() + i)); + } +}; + +template +struct LoopExecute, + SEGMENT> +{ + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment, + BODY const &body) + { + const int i0 = IndexMapper0::template index(); + const int i1 = IndexMapper1::template index(); + const int i2 = IndexMapper2::template index(); + + const diff_t i0_stride = IndexMapper0::template size(); + const diff_t i1_stride = IndexMapper1::template size(); + + const int i = i0 + i0_stride*(i1 + i1_stride*i2); + + body(*(segment.begin() + i)); + } +}; + template struct LoopExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size, + SEGMENT const &segment, + BODY const &body) + { + const diff_t i = IndexMapper::template index() * static_cast(tile_size); + + body(segment.slice(i, static_cast(tile_size))); + } +}; + template struct TileExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size, + SEGMENT const &segment, + BODY const &body) + { + const diff_t t = IndexMapper::template index(); + const diff_t i = t * static_cast(tile_size); + + body(segment.slice(i, static_cast(tile_size)), t); + } +}; + template struct TileTCountExecute Date: Sun, 25 Aug 2024 15:22:53 -0700 Subject: [PATCH 05/15] Add support for hip unchecked in kernel --- include/RAJA/policy/hip/kernel/For.hpp | 59 ++++++ include/RAJA/policy/hip/kernel/ForICount.hpp | 52 +++++ include/RAJA/policy/hip/kernel/Tile.hpp | 84 ++++++++ 
include/RAJA/policy/hip/kernel/TileTCount.hpp | 67 +++++++ include/RAJA/policy/hip/kernel/internal.hpp | 179 ++++++++++++++++++ 5 files changed, 441 insertions(+) diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index 39e7104c16..1f181d4590 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -30,6 +30,65 @@ namespace RAJA namespace internal { +/* + * Executor for work sharing inside HipKernel. + * Mapping without checking from IndexMapper to indices + * Assigns the loop index to offset ArgumentId + * Meets all sync requirements + */ +template +struct HipStatementExecutor< + Data, + statement::For, + EnclosedStmts...>, + Types> { + + using stmt_list_t = StatementList; + + // Set the argument type for this loop + using NewTypes = setSegmentTypeFromData; + + using enclosed_stmts_t = + HipStatementListExecutor; + + using diff_t = segment_diff_type; + + using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< + RAJA::policy::hip::hip_indexer>; + + static inline RAJA_DEVICE + void exec(Data &data, bool thread_active) + { + const diff_t i = IndexMapper::template index(); + + // Assign the index to the argument + data.template assign_offset(i); + + // execute enclosed statements + enclosed_stmts_t::exec(data, thread_active); + } + + static inline + LaunchDims calculateDimensions(Data const &data) + { + const diff_t len = segment_length(data); + + HipDims my_dims(0), my_min_dims(0); + DimensionCalculator::set_dimensions(my_dims, my_min_dims, len); + LaunchDims dims{my_dims, my_min_dims}; + + // combine with enclosed statements + LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data); + return dims.max(enclosed_dims); + } +}; + /* * Executor for work sharing inside HipKernel. 
* Mapping directly from IndexMapper to indices diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index ba6642f248..30d143c6cf 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -30,6 +30,58 @@ namespace RAJA namespace internal { +/* + * Executor for work sharing inside HipKernel. + * Provides an unchecked mapping. + * Assigns the loop index to offset ArgumentId + * Assigns the loop index to param ParamId + * Meets all sync requirements + */ +template +struct HipStatementExecutor< + Data, + statement::ForICount, + EnclosedStmts...>, + Types> + : HipStatementExecutor< + Data, + statement::For, + EnclosedStmts...>, + Types> { + + using Base = HipStatementExecutor< + Data, + statement::For, + EnclosedStmts...>, + Types>; + + using typename Base::enclosed_stmts_t; + using typename Base::diff_t; + + static inline RAJA_DEVICE + void exec(Data &data, bool thread_active) + { + // grid stride loop + const diff_t i = IndexMapper::template index(); + + // Assign the index to the argument and param + data.template assign_offset(i); + data.template assign_param(i); + + // execute enclosed statements + enclosed_stmts_t::exec(data, thread_active); + } +}; + /* * Executor for work sharing inside HipKernel. * Provides a direct mapping. diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp index 62dda7f20d..90c147329c 100644 --- a/include/RAJA/policy/hip/kernel/Tile.hpp +++ b/include/RAJA/policy/hip/kernel/Tile.hpp @@ -42,6 +42,90 @@ namespace RAJA namespace internal { +/*! 
+ * A specialized RAJA::kernel hip_impl executor for statement::Tile + * Assigns the tile segment to segment ArgumentId + * Meets all sync requirements + */ +template +struct HipStatementExecutor< + Data, + statement::Tile, + RAJA::policy::hip::hip_indexer, + EnclosedStmts...>, + Types> + { + + using stmt_list_t = StatementList; + + using enclosed_stmts_t = HipStatementListExecutor; + + using diff_t = segment_diff_type; + + using DimensionCalculator = KernelDimensionCalculator>; + + static inline RAJA_DEVICE + void exec(Data &data, bool thread_active) + { + // Get the segment referenced by this Tile statement + auto &segment = camp::get(data.segment_tuple); + + using segment_t = camp::decay; + + // compute trip count + const diff_t i = IndexMapper::template index() * static_cast(chunk_size); + + // Keep copy of original segment, so we can restore it + segment_t orig_segment = segment; + + // Assign our new tiled segment + segment = orig_segment.slice(i, static_cast(chunk_size)); + + // execute enclosed statements + enclosed_stmts_t::exec(data, thread_active); + + // Set range back to original values + segment = orig_segment; + } + + static inline + LaunchDims calculateDimensions(Data const &data) + { + // Compute how many chunks + const diff_t full_len = segment_length(data); + const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast(chunk_size)); + + HipDims my_dims(0), my_min_dims(0); + DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len); + LaunchDims dims{my_dims, my_min_dims}; + + // privatize data, so we can mess with the segments + using data_t = camp::decay; + data_t private_data = data; + + // Get original segment + auto &segment = camp::get(private_data.segment_tuple); + + // restrict to first tile + segment = segment.slice(0, static_cast(chunk_size)); + + // NOTE: We do not detect improper uses of unchecked policies under tiling. 
+ // This happens when using an unchecked policy on a tiled range that is not + // evenly divisible by chunk_size. + LaunchDims enclosed_dims = + enclosed_stmts_t::calculateDimensions(private_data); + + return dims.max(enclosed_dims); + } +}; + /*! * A specialized RAJA::kernel hip_impl executor for statement::Tile * Assigns the tile segment to segment ArgumentId diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp index 07637fbd8f..6975c5a083 100644 --- a/include/RAJA/policy/hip/kernel/TileTCount.hpp +++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp @@ -42,6 +42,73 @@ namespace RAJA namespace internal { +/*! + * A specialized RAJA::kernel hip_impl executor for statement::TileTCount + * Assigns the tile segment to segment ArgumentId + * Assigns the tile index to param ParamId + * Meets all sync requirements + */ +template +struct HipStatementExecutor< + Data, + statement::TileTCount, + RAJA::policy::hip::hip_indexer, + EnclosedStmts...>, + Types> + : public HipStatementExecutor< + Data, + statement::Tile, + RAJA::policy::hip::hip_indexer, + EnclosedStmts...>, + Types> { + + using Base = HipStatementExecutor< + Data, + statement::Tile, + RAJA::policy::hip::hip_indexer, + EnclosedStmts...>, + Types>; + + using typename Base::enclosed_stmts_t; + using typename Base::diff_t; + + static inline RAJA_DEVICE + void exec(Data &data, bool thread_active) + { + // Get the segment referenced by this Tile statement + auto &segment = camp::get(data.segment_tuple); + + using segment_t = camp::decay; + + // compute trip count + const diff_t t = IndexMapper::template index(); + const diff_t i = t * static_cast(chunk_size); + + // Keep copy of original segment, so we can restore it + segment_t orig_segment = segment; + + // Assign our new tiled segment + segment = orig_segment.slice(i, static_cast(chunk_size)); + data.template assign_param(t); + + // execute enclosed statements + enclosed_stmts_t::exec(data, thread_active); + + 
// Set range back to original values + segment = orig_segment; + } +}; + /*! * A specialized RAJA::kernel hip_impl executor for statement::TileTCount * Assigns the tile segment to segment ArgumentId diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index aa0610d736..c518d67f1f 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -215,6 +215,185 @@ using hip_statement_list_executor_t = HipStatementListExecutor< template struct KernelDimensionCalculator; +// specialization for unchecked sequential policies +template +struct KernelDimensionCalculator>> +{ + using IndexMapper = hip::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len) + { + if ( len != static_cast(1) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + } +}; + +// specialization for unchecked thread policies +template +struct KernelDimensionCalculator>> +{ + using IndexMapper = hip::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + { + // BEWARE: if calculated block_size is too high then the kernel launch will fail + set_hip_dim(dims.threads, static_cast(len)); + set_hip_dim(min_dims.threads, static_cast(len)); + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = hip::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + { + if ( len != static_cast(IndexMapper::block_size) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); + set_hip_dim(min_dims.threads, 
static_cast(IndexMapper::block_size)); + } +}; + +// specialization for unchecked block policies +template +struct KernelDimensionCalculator>> +{ + using IndexMapper = hip::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + { + set_hip_dim(dims.blocks, static_cast(len)); + set_hip_dim(min_dims.blocks, static_cast(len)); + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = hip::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + { + if ( len != static_cast(IndexMapper::grid_size) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + set_hip_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); + } +}; + +// specialization for unchecked global policies +template +struct KernelDimensionCalculator>> +{ + using IndexMapper = hip::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len) + { + if (len != static_cast(0)) { + RAJA_ABORT_OR_THROW("must know one of block_size or grid_size"); + } + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = hip::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + { + // BEWARE: if calculated block_size is too high then the kernel launch will fail + const IdxT block_size = RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)); + if ( len != (block_size * static_cast(IndexMapper::grid_size)) ) { + RAJA_ABORT_OR_THROW("len 
does not match the size of the unchecked mapped index space"); + } + set_hip_dim(dims.threads, block_size); + set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + set_hip_dim(min_dims.threads, block_size); + set_hip_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = hip::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + { + const IdxT grid_size = RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)); + if ( len != (static_cast(IndexMapper::block_size) * grid_size) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); + set_hip_dim(dims.blocks, grid_size); + set_hip_dim(min_dims.threads, static_cast(IndexMapper::block_size)); + set_hip_dim(min_dims.blocks, grid_size); + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = hip::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + { + if ( len != (static_cast(IndexMapper::block_size) * + static_cast(IndexMapper::grid_size)) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); + set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + set_hip_dim(min_dims.threads, static_cast(IndexMapper::block_size)); + set_hip_dim(min_dims.blocks, 
static_cast(IndexMapper::grid_size)); + } +}; + + // specialization for direct sequential policies template struct KernelDimensionCalculator Date: Sun, 25 Aug 2024 15:23:10 -0700 Subject: [PATCH 06/15] Add hip warp unchecked policy --- include/RAJA/policy/hip/policy.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index a78b5838c0..9872bea553 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1463,6 +1463,10 @@ using hip_multi_reduce_atomic_low_performance_low_overhead = using policy::hip::hip_block_reduce; using policy::hip::hip_warp_reduce; +using hip_warp_unchecked = RAJA::policy::hip::hip_indexer< + iteration_mapping::Unchecked, + kernel_sync_requirement::none, + hip::thread_x>; using hip_warp_direct = RAJA::policy::hip::hip_indexer< iteration_mapping::Direct, kernel_sync_requirement::none, From 31f0744267aa336f24f7246fa21369ee392b97f8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 25 Aug 2024 15:23:25 -0700 Subject: [PATCH 07/15] Add unchecked policies to docs --- docs/sphinx/user_guide/feature/policies.rst | 190 ++++++++++++++++---- 1 file changed, 155 insertions(+), 35 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 768e32674b..8ba998f012 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -309,11 +309,22 @@ policies have the prefix ``hip_``. | | | expression is executed | | | | on the device. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates directly to | +| cuda/hip_thread_x_unchecked | kernel (For) | Map loop iterates unchecked to | | | launch (loop) | GPU threads in x-dimension, one | | | | iterate per thread. See note | | | | below about limitations. 
| +----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_y_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_z_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates directly to | +| | launch (loop) | GPU threads in x-dimension, one | +| | | or no iterates per thread. See | +| | | note below about limitations. | ++----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_thread_y_direct | kernel (For) | Same as above, but map | | | launch (loop) | to threads in y-dimension. | +----------------------------------------------------+---------------+---------------------------------+ @@ -335,6 +346,17 @@ policies have the prefix ``hip_``. | | launch (loop) | policy, but safe to use | | | | with Cuda/HipSyncThreads. | +----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_x_unchecked | kernel (For) | Same as thread_x_unchecked | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | threads. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_y_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_z_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. 
| ++----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_thread_size_x_direct | kernel (For) | Same as thread_x_direct | | | launch (loop) | policy above but with | | | | a compile time number of | @@ -346,16 +368,34 @@ policies have the prefix ``hip_``. | cuda/hip_thread_size_z_direct | kernel (For) | Same as above, but map | | | launch (loop) | to threads in z-dimension. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Reshapes threads in a | +| cuda/hip_flatten_threads_{xyz}_unchecked | launch (loop) | Reshapes threads in a | | | | multi-dimensional thread | -| | | team into one-dimension, | -| | | accepts any permutation | -| | | of dimensions | +| | | team into one-dimension. | +| | | Accepts any permutation | +| | | of one, two, or three | +| | | dimensions. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Same as above, but with direct | +| | | mapping. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_block_x_direct | kernel (For) | Map loop iterates | -| | launch (loop) | directly to GPU thread | -| | | blocks in x-dimension, | -| | | one iterate per block | +| cuda/hip_flatten_threads_{xyz}_loop | launch (loop) | Same as above, but with loop | +| | | mapping. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_unchecked | kernel (For) | Map loop iterates unchecked | +| | launch (loop) | to GPU thread blocks in the | +| | | x-dimension, one iterate per | +| | | block. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_y_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_z_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_direct | kernel (For) | Map loop iterates directly to | +| | launch (loop) | GPU thread blocks in the | +| | | x-dimension, one or no iterates | +| | | per block. | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_block_y_direct | kernel (For) | Same as above, but map | | | launch (loop) | to blocks in y-dimension | @@ -363,9 +403,8 @@ policies have the prefix ``hip_``. | cuda/hip_block_z_direct | kernel (For) | Same as above, but map | | | launch (loop) | to blocks in z-dimension | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_block_x_loop | kernel (For) | Similar to | -| | launch (loop) | block-x-direct policy, | -| | | but use a grid-stride | +| cuda/hip_block_x_loop | kernel (For) | Similar to block-x-direct | +| | launch (loop) | policy, but use a grid-stride | | | | loop. | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_block_y_loop | kernel (For) | Same as above, but use | @@ -374,10 +413,19 @@ policies have the prefix ``hip_``. 
| cuda/hip_block_z_loop | kernel (For) | Same as above, but use | | | launch (loop) | blocks in z-dimension | +----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_x_unchecked | kernel (For) | Same as block_x_unchecked | +| | launch (loop) | policy above but with a | +| | | compile time number of blocks | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_y_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_z_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | -| | launch (loop) | policy above but with | -| | | a compile time number of | -| | | blocks | +| | launch (loop) | policy above but with a | +| | | compile time number of blocks | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_block_size_y_direct | kernel (For) | Same as above, but map | | | launch (loop) | to blocks in y-dim | @@ -385,13 +433,38 @@ policies have the prefix ``hip_``. | cuda/hip_block_size_z_direct | kernel (For) | Same as above, but map | | | launch (loop) | to blocks in z-dim | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_global_x_direct | kernel (For) | Creates a unique thread | -| | launch (loop) | id for each thread on | -| | | x-dimension of the grid. 
| +| cuda/hip_block_size_x_loop | kernel (For) | Same as block_x_loop | +| | launch (loop) | policy above but with a | +| | | compile time number of blocks | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_y_loop | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_z_loop | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_unchecked | kernel (For) | Map loop iterates unchecked | +| | launch (loop) | to GPU threads in the grid in | +| | | the x-dimension, one iterate | +| | | per thread. Creates a unique | +| | | thread id for each thread on | +| | | the x-dimension of the grid. | | | | Same as computing | | | | threadIdx.x + | | | | threadDim.x * blockIdx.x. | +----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_y_unchecked | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_z_unchecked | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_direct | kernel (For) | Same as global_x_unchecked | +| | launch (loop) | above, but maps loop iterates | +| | | directly to GPU threads in the | +| | | grid, one or no iterates per | +| | | thread. 
| ++----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_global_y_direct | kernel (For) | Same as above, but uses | | | launch (loop) | globals in y-dimension. | +----------------------------------------------------+---------------+---------------------------------+ @@ -409,6 +482,17 @@ policies have the prefix ``hip_``. | cuda/hip_global_z_loop | kernel (For) | Same as above, but use | | | launch (loop) | globals in z-dimension | +----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_x_unchecked | kernel (For) | Same as global_x_unchecked | +| | launch (loop) | policy above but with | +| | | a compile time block | +| | | size. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_y_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_z_unchecked | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_global_size_x_direct | kernel (For) | Same as global_x_direct | | | launch (loop) | policy above but with | | | | a compile time block | @@ -420,24 +504,34 @@ policies have the prefix ``hip_``. | cuda/hip_global_size_z_direct | kernel (For) | Same as above, but map | | | launch (loop) | to globals in z-dim | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_warp_direct | kernel (For) | Map work to threads | -| | | in a warp directly. 
| -| | | Cannot be used in | -| | | conjunction with | -| | | cuda/hip_thread_x_* | +| cuda/hip_global_size_x_loop | kernel (For) | Same as global_x_loop | +| | launch (loop) | policy above but with | +| | | a compile time block | +| | | size. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_y_loop | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_z_loop | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_unchecked | kernel (For) | Map work to threads in a | +| | | warp unchecked. | +| | | Cannot be used in conjunction | +| | | with cuda/hip_thread_x_* | | | | policies. | | | | Multiple warps can be | | | | created by using | | | | cuda/hip_thread_y/z_* | | | | policies. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_warp_loop | kernel (For) | Map work to threads in a warp | +| cuda/hip_warp_direct | kernel (For) | Similar to warp_unchecked, but | +| | | map work to threads | +| | | in a warp directly. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_loop | kernel (For) | Similar to warp_direct, but | +| | | map work to threads in a warp | | | | using a warp-stride loop. | -| | | Cannot be used with | -| | | cuda/hip_thread_x_* policies. | -| | | Multiple warps can be created | -| | | by using cuda/hip_thread_y/z_* | -| | | policies. 
| +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_warp_masked_direct> | kernel | Mmap work directly to threads | | | (For) | in a warp using a bit mask. | @@ -495,9 +589,31 @@ policies: | | BLOCKS_PER_SM_OFFSET) * sm_per_device | +----------------------------------------------------+-----------------------------------------+ +Several notable constraints apply to RAJA CUDA/HIP *unchecked* policies. + +.. note:: * Unchecked policies do not mask out threads that are out-of-range. + So they should only be used when the size of the range matches the + size of the block or grid. + * Repeating unchecked policies with the same dimension in perfectly + nested loops is not recommended. Your code may do something, but + likely will not do what you expect and/or be correct. + * If multiple unchecked policies are used in a kernel (using different + dimensions), the product of sizes of the corresponding iteration + spaces cannot be greater than the maximum allowable threads per + block or blocks per grid. Typically, this is 1024 threads per + block. Attempting to execute a kernel with more than the maximum + allowed causes the CUDA/HIP runtime to complain about + *illegal launch parameters.* + * **Block-unchecked policies are recommended for most tiled loop + patterns. In these cases the CUDA/HIP kernel is launched with the + exact number of blocks needed so no checking is necessary.** + Several notable constraints apply to RAJA CUDA/HIP *direct* policies. -.. note:: * Repeating direct policies with the same dimension in perfectly +.. note:: * Direct policies mask out threads that are out-of-range. + So they should only be used when the size of the range is less than + or equal to the size of the block or grid. + * Repeating direct policies with the same dimension in perfectly nested loops is not recommended. Your code may do something, but likely will not do what you expect and/or be correct. 
* If multiple direct policies are used in a kernel (using different @@ -516,7 +632,10 @@ Several notable constraints apply to RAJA CUDA/HIP *direct* policies. Several notes regarding CUDA/HIP *loop* policies are also good to know. -.. note:: * There is no constraint on the product of sizes of the associated +.. note:: * Loop policies perform a block or grid stride loop. + So they can be used when the size of the range exceeds the size of + the block or grid. + * There is no constraint on the product of sizes of the associated loop iteration space. * These polices allow having a larger number of iterates than threads/blocks in the x, y, or z dimension. @@ -529,9 +648,10 @@ Several notes regarding CUDA/HIP *loop* policies are also good to know. Finally -.. note:: CUDA/HIP block-direct policies may be preferable to block-loop - policies in situations where block load balancing may be an issue - as the block-direct policies may yield better performance. +.. note:: CUDA/HIP block-unchecked or block-direct policies may be preferable + to block-loop policies in situations where block load balancing may + be an issue as the block-unchecked or block-direct policies may yield + better performance. Several notes regarding the CUDA/HIP policy implementation that allow you to write more explicit policies. @@ -541,7 +661,7 @@ write more explicit policies. behavior of the policy. * Policies have a mapping from loop iterations to iterates in the index set via a iteration_mapping enum template parameter. The - possible values are Direct and StridedLoop. + possible values are Unchecked, Direct, and StridedLoop. * Policies can be safely used with some synchronization constructs via a kernel_sync_requirement enum template parameter. The possible values are none and sync. 
From 7cad03b0e5a3452192bac2bf9d419b748cee51ca Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 25 Aug 2024 21:28:16 -0700 Subject: [PATCH 08/15] Add cuda explicit implementation --- include/RAJA/policy/cuda/kernel/For.hpp | 59 + include/RAJA/policy/cuda/kernel/ForICount.hpp | 52 + include/RAJA/policy/cuda/kernel/Tile.hpp | 84 ++ .../RAJA/policy/cuda/kernel/TileTCount.hpp | 67 + include/RAJA/policy/cuda/kernel/internal.hpp | 179 +++ include/RAJA/policy/cuda/launch.hpp | 261 ++++ include/RAJA/policy/cuda/policy.hpp | 1137 +++++------------ 7 files changed, 1000 insertions(+), 839 deletions(-) diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp index 58ffa1ba14..abefacd9e5 100644 --- a/include/RAJA/policy/cuda/kernel/For.hpp +++ b/include/RAJA/policy/cuda/kernel/For.hpp @@ -30,6 +30,65 @@ namespace RAJA namespace internal { +/* + * Executor for work sharing inside CudaKernel. + * Mapping without checking from IndexMapper to indices + * Assigns the loop index to offset ArgumentId + * Meets all sync requirements + */ +template +struct CudaStatementExecutor< + Data, + statement::For, + EnclosedStmts...>, + Types> { + + using stmt_list_t = StatementList; + + // Set the argument type for this loop + using NewTypes = setSegmentTypeFromData; + + using enclosed_stmts_t = + CudaStatementListExecutor; + + using diff_t = segment_diff_type; + + using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< + RAJA::policy::cuda::cuda_indexer>; + + static inline RAJA_DEVICE + void exec(Data &data, bool thread_active) + { + const diff_t i = IndexMapper::template index(); + + // Assign the index to the argument + data.template assign_offset(i); + + // execute enclosed statements + enclosed_stmts_t::exec(data, thread_active); + } + + static inline + LaunchDims calculateDimensions(Data const &data) + { + const diff_t len = segment_length(data); + + CudaDims my_dims(0), my_min_dims(0); + 
DimensionCalculator::set_dimensions(my_dims, my_min_dims, len); + LaunchDims dims{my_dims, my_min_dims}; + + // combine with enclosed statements + LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data); + return dims.max(enclosed_dims); + } +}; + /* * Executor for work sharing inside CudaKernel. * Mapping directly from IndexMapper to indices diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index 87556ed8b1..85b82a9cc6 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -30,6 +30,58 @@ namespace RAJA namespace internal { +/* + * Executor for work sharing inside CudaKernel. + * Provides an unchecked mapping. + * Assigns the loop index to offset ArgumentId + * Assigns the loop index to param ParamId + * Meets all sync requirements + */ +template +struct CudaStatementExecutor< + Data, + statement::ForICount, + EnclosedStmts...>, + Types> + : CudaStatementExecutor< + Data, + statement::For, + EnclosedStmts...>, + Types> { + + using Base = CudaStatementExecutor< + Data, + statement::For, + EnclosedStmts...>, + Types>; + + using typename Base::enclosed_stmts_t; + using typename Base::diff_t; + + static inline RAJA_DEVICE + void exec(Data &data, bool thread_active) + { + // grid stride loop + const diff_t i = IndexMapper::template index(); + + // Assign the index to the argument and param + data.template assign_offset(i); + data.template assign_param(i); + + // execute enclosed statements + enclosed_stmts_t::exec(data, thread_active); + } +}; + /* * Executor for work sharing inside CudaKernel. * Provides a direct mapping. diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp index ad901f6b02..865e476da0 100644 --- a/include/RAJA/policy/cuda/kernel/Tile.hpp +++ b/include/RAJA/policy/cuda/kernel/Tile.hpp @@ -42,6 +42,90 @@ namespace RAJA namespace internal { +/*! 
+ * A specialized RAJA::kernel cuda_impl executor for statement::Tile + * Assigns the tile segment to segment ArgumentId + * Meets all sync requirements + */ +template +struct CudaStatementExecutor< + Data, + statement::Tile, + RAJA::policy::cuda::cuda_indexer, + EnclosedStmts...>, + Types> + { + + using stmt_list_t = StatementList; + + using enclosed_stmts_t = CudaStatementListExecutor; + + using diff_t = segment_diff_type; + + using DimensionCalculator = KernelDimensionCalculator>; + + static inline RAJA_DEVICE + void exec(Data &data, bool thread_active) + { + // Get the segment referenced by this Tile statement + auto &segment = camp::get(data.segment_tuple); + + using segment_t = camp::decay; + + // compute trip count + const diff_t i = IndexMapper::template index() * static_cast(chunk_size); + + // Keep copy of original segment, so we can restore it + segment_t orig_segment = segment; + + // Assign our new tiled segment + segment = orig_segment.slice(i, static_cast(chunk_size)); + + // execute enclosed statements + enclosed_stmts_t::exec(data, thread_active); + + // Set range back to original values + segment = orig_segment; + } + + static inline + LaunchDims calculateDimensions(Data const &data) + { + // Compute how many chunks + const diff_t full_len = segment_length(data); + const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast(chunk_size)); + + CudaDims my_dims(0), my_min_dims(0); + DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len); + LaunchDims dims{my_dims, my_min_dims}; + + // privatize data, so we can mess with the segments + using data_t = camp::decay; + data_t private_data = data; + + // Get original segment + auto &segment = camp::get(private_data.segment_tuple); + + // restrict to first tile + segment = segment.slice(0, static_cast(chunk_size)); + + // NOTE: We do not detect improper uses of unchecked policies under tiling. 
+ // This happens when using an unchecked policy on a tiled range that is not + // evenly divisible by chunk_size. + LaunchDims enclosed_dims = + enclosed_stmts_t::calculateDimensions(private_data); + + return dims.max(enclosed_dims); + } +}; + /*! * A specialized RAJA::kernel cuda_impl executor for statement::Tile * Assigns the tile segment to segment ArgumentId diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp index c611346d46..513b2fded4 100644 --- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp +++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp @@ -42,6 +42,73 @@ namespace RAJA namespace internal { +/*! + * A specialized RAJA::kernel cuda_impl executor for statement::TileTCount + * Assigns the tile segment to segment ArgumentId + * Assigns the tile index to param ParamId + * Meets all sync requirements + */ +template +struct CudaStatementExecutor< + Data, + statement::TileTCount, + RAJA::policy::cuda::cuda_indexer, + EnclosedStmts...>, + Types> + : public CudaStatementExecutor< + Data, + statement::Tile, + RAJA::policy::cuda::cuda_indexer, + EnclosedStmts...>, + Types> { + + using Base = CudaStatementExecutor< + Data, + statement::Tile, + RAJA::policy::cuda::cuda_indexer, + EnclosedStmts...>, + Types>; + + using typename Base::enclosed_stmts_t; + using typename Base::diff_t; + + static inline RAJA_DEVICE + void exec(Data &data, bool thread_active) + { + // Get the segment referenced by this Tile statement + auto &segment = camp::get(data.segment_tuple); + + using segment_t = camp::decay; + + // compute trip count + const diff_t t = IndexMapper::template index(); + const diff_t i = t * static_cast(chunk_size); + + // Keep copy of original segment, so we can restore it + segment_t orig_segment = segment; + + // Assign our new tiled segment + segment = orig_segment.slice(i, static_cast(chunk_size)); + data.template assign_param(t); + + // execute enclosed statements + enclosed_stmts_t::exec(data, 
thread_active); + + // Set range back to original values + segment = orig_segment; + } +}; + /*! * A specialized RAJA::kernel cuda_impl executor for statement::TileTCount * Assigns the tile segment to segment ArgumentId diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index 9c904ea45a..09be99c506 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -215,6 +215,185 @@ using cuda_statement_list_executor_t = CudaStatementListExecutor< template struct KernelDimensionCalculator; +// specialization for unchecked sequential policies +template +struct KernelDimensionCalculator>> +{ + using IndexMapper = cuda::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len) + { + if ( len != static_cast(1) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the directly mapped index space"); + } + } +}; + +// specialization for unchecked thread policies +template +struct KernelDimensionCalculator>> +{ + using IndexMapper = cuda::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + { + // BEWARE: if calculated block_size is too high then the kernel launch will fail + set_cuda_dim(dims.threads, static_cast(len)); + set_cuda_dim(min_dims.threads, static_cast(len)); + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = cuda::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + { + if ( len != static_cast(IndexMapper::block_size) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + set_cuda_dim(dims.threads, 
static_cast(IndexMapper::block_size)); + set_cuda_dim(min_dims.threads, static_cast(IndexMapper::block_size)); + } +}; + +// specialization for unchecked block policies +template +struct KernelDimensionCalculator>> +{ + using IndexMapper = cuda::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + { + set_cuda_dim(dims.blocks, static_cast(len)); + set_cuda_dim(min_dims.blocks, static_cast(len)); + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = cuda::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + { + if ( len != static_cast(IndexMapper::grid_size) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + set_cuda_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); + } +}; + +// specialization for unchecked global policies +template +struct KernelDimensionCalculator>> +{ + using IndexMapper = cuda::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len) + { + if (len != static_cast(0)) { + RAJA_ABORT_OR_THROW("must know one of block_size or grid_size"); + } + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = cuda::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + { + // BEWARE: if calculated block_size is too high then the kernel launch will fail + const IdxT block_size = RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)); + if ( 
len != (block_size * static_cast(IndexMapper::grid_size)) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + set_cuda_dim(dims.threads, block_size); + set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + set_cuda_dim(min_dims.threads, block_size); + set_cuda_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = cuda::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + { + const IdxT grid_size = RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)); + if ( len != (static_cast(IndexMapper::block_size) * grid_size) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); + set_cuda_dim(dims.blocks, grid_size); + set_cuda_dim(min_dims.threads, static_cast(IndexMapper::block_size)); + set_cuda_dim(min_dims.blocks, grid_size); + } +}; +/// +template +struct KernelDimensionCalculator>> +{ + static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel"); + + using IndexMapper = cuda::IndexGlobal; + + template < typename IdxT > + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + { + if ( len != (static_cast(IndexMapper::block_size) * + static_cast(IndexMapper::grid_size)) ) { + RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + } + set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); + set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + 
set_cuda_dim(min_dims.threads, static_cast(IndexMapper::block_size)); + set_cuda_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); + } +}; + + // specialization for direct sequential policies template struct KernelDimensionCalculator +struct LoopExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE + void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment, + BODY const &body) + { + const diff_t i = IndexMapper::template index(); + + body(*(segment.begin() + i)); + } +}; + +template +struct LoopExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index(); + const diff_t i1 = IndexMapper1::template index(); + + body(*(segment0.begin() + i0), *(segment1.begin() + i1)); + } +}; + +template +struct LoopExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index(); + const diff_t i1 = IndexMapper1::template index(); + const diff_t i2 = IndexMapper2::template index(); + + body(*(segment0.begin() + i0), + *(segment1.begin() + i1), + *(segment2.begin() + i2)); + } +}; + template struct LoopExecute +struct LoopICountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment, + BODY const &body) + { + const diff_t i = IndexMapper::template index(); + + body(*(segment.begin() + 
i), i); + } +}; +template +struct LoopICountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index(); + const diff_t i1 = IndexMapper1::template index(); + + body(*(segment0.begin() + i0), + *(segment1.begin() + i1), + i0, i1); + } +}; + +template +struct LoopICountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index(); + const diff_t i1 = IndexMapper1::template index(); + const diff_t i2 = IndexMapper2::template index(); + + body(*(segment0.begin() + i0), + *(segment1.begin() + i1), + *(segment2.begin() + i2), + i0, i1, i2); + } +}; + template struct LoopICountExecute +struct LoopExecute, + SEGMENT> + : LoopExecute, + SEGMENT> +{}; + +template +struct LoopExecute, + SEGMENT> +{ + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment, + BODY const &body) + { + const int i0 = IndexMapper0::template index(); + const int i1 = IndexMapper1::template index(); + + const diff_t i0_stride = IndexMapper0::template size(); + + const int i = i0 + i0_stride*i1; + + body(*(segment.begin() + i)); + } +}; + +template +struct LoopExecute, + SEGMENT> +{ + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + SEGMENT const &segment, + BODY const &body) + { + const int i0 = 
IndexMapper0::template index(); + const int i1 = IndexMapper1::template index(); + const int i2 = IndexMapper2::template index(); + + const diff_t i0_stride = IndexMapper0::template size(); + const diff_t i1_stride = IndexMapper1::template size(); + + const int i = i0 + i0_stride*(i1 + i1_stride*i2); + + body(*(segment.begin() + i)); + } +}; + template struct LoopExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size, + SEGMENT const &segment, + BODY const &body) + { + const diff_t i = IndexMapper::template index() * static_cast(tile_size); + + body(segment.slice(i, static_cast(tile_size))); + } +}; + template struct TileExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size, + SEGMENT const &segment, + BODY const &body) + { + const diff_t t = IndexMapper::template index(); + const diff_t i = t * static_cast(tile_size); + + body(segment.slice(i, static_cast(tile_size)), t); + } +}; + template struct TileTCountExecute>; using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< iteration_mapping::Direct, kernel_sync_requirement::none, @@ -1583,6 +1587,12 @@ using cuda_launch_t = policy::cuda::cuda_launch_explicit_t +using cuda_indexer_unchecked = policy::cuda::cuda_indexer< + iteration_mapping::Unchecked, + kernel_sync_requirement::none, + indexers...>; + template < typename ... indexers > using cuda_indexer_direct = policy::cuda::cuda_indexer< iteration_mapping::Direct, @@ -1601,6 +1611,12 @@ using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer< kernel_sync_requirement::sync, indexers...>; +template < typename ... 
indexers > +using cuda_flatten_indexer_unchecked = policy::cuda::cuda_flatten_indexer< + iteration_mapping::Unchecked, + kernel_sync_requirement::none, + indexers...>; + template < typename ... indexers > using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer< iteration_mapping::Direct, @@ -1613,890 +1629,333 @@ using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer< kernel_sync_requirement::none, indexers...>; -/*! - * Maps segment indices to CUDA threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical threads to fit all of the direct map requests. - * For example, a segment of size 2000 will not fit, and trigger a runtime - * error. - */ -template < named_dim ... dims > -using cuda_thread_direct = cuda_indexer_direct< - cuda::IndexGlobal...>; - -using cuda_thread_x_direct = cuda_thread_direct; -using cuda_thread_y_direct = cuda_thread_direct; -using cuda_thread_z_direct = cuda_thread_direct; - -using cuda_thread_xy_direct = cuda_thread_direct; -using cuda_thread_xz_direct = cuda_thread_direct; -using cuda_thread_yx_direct = cuda_thread_direct; -using cuda_thread_yz_direct = cuda_thread_direct; -using cuda_thread_zx_direct = cuda_thread_direct; -using cuda_thread_zy_direct = cuda_thread_direct; - -using cuda_thread_xyz_direct = cuda_thread_direct; -using cuda_thread_xzy_direct = cuda_thread_direct; -using cuda_thread_yxz_direct = cuda_thread_direct; -using cuda_thread_yzx_direct = cuda_thread_direct; -using cuda_thread_zxy_direct = cuda_thread_direct; -using cuda_thread_zyx_direct = cuda_thread_direct; -/*! - * Maps segment indices to CUDA threads. - * Uses block-stride looping to exceed the maximum number of physical threads - */ -template < named_dim ... dims > -using cuda_thread_loop = cuda_indexer_loop< - cuda::IndexGlobal...>; - -template < named_dim ... 
dims > -using cuda_thread_syncable_loop = cuda_indexer_syncable_loop< - cuda::IndexGlobal...>; - -using cuda_thread_x_loop = cuda_thread_loop; -using cuda_thread_y_loop = cuda_thread_loop; -using cuda_thread_z_loop = cuda_thread_loop; - -using cuda_thread_xy_loop = cuda_thread_loop; -using cuda_thread_xz_loop = cuda_thread_loop; -using cuda_thread_yx_loop = cuda_thread_loop; -using cuda_thread_yz_loop = cuda_thread_loop; -using cuda_thread_zx_loop = cuda_thread_loop; -using cuda_thread_zy_loop = cuda_thread_loop; - -using cuda_thread_xyz_loop = cuda_thread_loop; -using cuda_thread_xzy_loop = cuda_thread_loop; -using cuda_thread_yxz_loop = cuda_thread_loop; -using cuda_thread_yzx_loop = cuda_thread_loop; -using cuda_thread_zxy_loop = cuda_thread_loop; -using cuda_thread_zyx_loop = cuda_thread_loop; +// helper to generate the many policy aliases +#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_POLICIES_HELPER(flatten, scope, mapping) \ + \ + using cuda_##flatten##scope##_x_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_y_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_z_##mapping = cuda_##flatten##scope##_##mapping; \ + \ + using cuda_##flatten##scope##_xy_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_xz_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_yx_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_yz_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_zx_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_zy_##mapping = cuda_##flatten##scope##_##mapping; \ + \ + using cuda_##flatten##scope##_xyz_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_xzy_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_yxz_##mapping = cuda_##flatten##scope##_##mapping; \ + using 
cuda_##flatten##scope##_yzx_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_zxy_##mapping = cuda_##flatten##scope##_##mapping; \ + using cuda_##flatten##scope##_zyx_##mapping = cuda_##flatten##scope##_##mapping; + +// helper to generate the many thread policy aliases +#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(flatten, mapping) \ + template < named_dim ... dims > \ + using cuda_##flatten##thread_##mapping = cuda_##flatten##indexer_##mapping< \ + cuda::IndexGlobal...>; \ + \ + RAJA_INTERNAL_CUDA_ALIAS_INDEXER_POLICIES_HELPER(flatten, thread, mapping) + +// helper to generate the many block policy aliases +#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(flatten, mapping) \ + template < named_dim ... dims > \ + using cuda_##flatten##block_##mapping = cuda_##flatten##indexer_##mapping< \ + cuda::IndexGlobal...>; \ + \ + RAJA_INTERNAL_CUDA_ALIAS_INDEXER_POLICIES_HELPER(flatten, block, mapping) + +// helper to generate the many global policy aliases +#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten, mapping) \ + template < named_dim ... dims > \ + using cuda_##flatten##global_##mapping = cuda_##flatten##indexer_##mapping< \ + cuda::IndexGlobal...>; \ + \ + RAJA_INTERNAL_CUDA_ALIAS_INDEXER_POLICIES_HELPER(flatten, global, mapping) -/* - * Maps segment indices to flattened CUDA threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical threads to fit all of the direct map requests. - * Reshapes multiple physical threads into a 1D iteration space - */ -template < named_dim ... 
dims > -using cuda_flatten_thread_direct = cuda_flatten_indexer_direct< - cuda::IndexGlobal...>; - -using cuda_flatten_thread_x_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_y_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_z_direct = cuda_flatten_thread_direct; - -using cuda_flatten_thread_xy_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_xz_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_yx_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_yz_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_zx_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_zy_direct = cuda_flatten_thread_direct; - -using cuda_flatten_thread_xyz_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_xzy_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_yxz_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_yzx_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_zxy_direct = cuda_flatten_thread_direct; -using cuda_flatten_thread_zyx_direct = cuda_flatten_thread_direct; -/* - * Maps segment indices to flattened CUDA threads. - * Reshapes multiple physical threads into a 1D iteration space - * Uses block-stride looping to exceed the maximum number of physical threads +/*! + * Maps segment indices to CUDA threads, blocks, or global threads. + * This is the lowest overhead mapping, but requires that there are the same + * number of physical threads, blocks, or global threads as map requests. + * For example, a segment of size 1000 will only fit into 1000 threads, blocks, or global threads, and + * triggers a runtime error in some cases. */ -template < named_dim ... 
dims > -using cuda_flatten_thread_loop = cuda_flatten_indexer_loop< - cuda::IndexGlobal...>; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(, unchecked) -using cuda_flatten_thread_x_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_y_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_z_loop = cuda_flatten_thread_loop; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(, unchecked) -using cuda_flatten_thread_xy_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_xz_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_yx_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_yz_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_zx_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_zy_loop = cuda_flatten_thread_loop; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(, unchecked) -using cuda_flatten_thread_xyz_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_xzy_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_yxz_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_yzx_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_zxy_loop = cuda_flatten_thread_loop; -using cuda_flatten_thread_zyx_loop = cuda_flatten_thread_loop; +/*! + * Maps segment indices to CUDA threads, blocks, or global threads. + * This is a low overhead mapping, but requires that there are enough + * physical threads, blocks, or global threads to fit all of the direct map requests. + * For example, a segment of size 2000 will not fit into 1024 threads, blocks, + * or global threads, and triggers a runtime error in some cases. + */ +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(, direct) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(, direct) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(, direct) /*! - * Maps segment indices to CUDA blocks. - * This is the lowest overhead mapping, but requires that there are enough - * physical blocks to fit all of the direct map requests. 
+ * Maps segment indices to CUDA threads, blocks, or global threads. + * Uses block-stride or grid-stride looping to exceed the maximum number of + * physical threads, blocks, or global threads. */ -template < named_dim ... dims > -using cuda_block_direct = cuda_indexer_direct< - cuda::IndexGlobal...>; - -using cuda_block_x_direct = cuda_block_direct; -using cuda_block_y_direct = cuda_block_direct; -using cuda_block_z_direct = cuda_block_direct; - -using cuda_block_xy_direct = cuda_block_direct; -using cuda_block_xz_direct = cuda_block_direct; -using cuda_block_yx_direct = cuda_block_direct; -using cuda_block_yz_direct = cuda_block_direct; -using cuda_block_zx_direct = cuda_block_direct; -using cuda_block_zy_direct = cuda_block_direct; - -using cuda_block_xyz_direct = cuda_block_direct; -using cuda_block_xzy_direct = cuda_block_direct; -using cuda_block_yxz_direct = cuda_block_direct; -using cuda_block_yzx_direct = cuda_block_direct; -using cuda_block_zxy_direct = cuda_block_direct; -using cuda_block_zyx_direct = cuda_block_direct; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(, loop) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(, loop) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(, loop) /*! - * Maps segment indices to CUDA blocks. - * Uses grid-stride looping to exceed the maximum number of blocks + * Only used in the "kernel" abstraction. + * Maps segment indices to CUDA threads, blocks, or global threads. + * Uses block-stride or grid-stride looping to exceed the maximum number of + * physical threads, blocks, or global threads. + * Allow synchronization in the loop, do not mask any threads out. */ -template < named_dim ... dims > -using cuda_block_loop = cuda_indexer_loop< - cuda::IndexGlobal...>; - -template < named_dim ... 
dims > -using cuda_block_syncable_loop = cuda_indexer_syncable_loop< - cuda::IndexGlobal...>; - -using cuda_block_x_loop = cuda_block_loop; -using cuda_block_y_loop = cuda_block_loop; -using cuda_block_z_loop = cuda_block_loop; - -using cuda_block_xy_loop = cuda_block_loop; -using cuda_block_xz_loop = cuda_block_loop; -using cuda_block_yx_loop = cuda_block_loop; -using cuda_block_yz_loop = cuda_block_loop; -using cuda_block_zx_loop = cuda_block_loop; -using cuda_block_zy_loop = cuda_block_loop; - -using cuda_block_xyz_loop = cuda_block_loop; -using cuda_block_xzy_loop = cuda_block_loop; -using cuda_block_yxz_loop = cuda_block_loop; -using cuda_block_yzx_loop = cuda_block_loop; -using cuda_block_zxy_loop = cuda_block_loop; -using cuda_block_zyx_loop = cuda_block_loop; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(, syncable_loop) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(, syncable_loop) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(, syncable_loop) + /* - * Maps segment indices to flattened CUDA blocks. - * This is the lowest overhead mapping, but requires that there are enough - * physical blocks to fit all of the direct map requests. - * Reshapes multiple physical blocks into a 1D iteration space + * Maps segment indices to flattened CUDA threads, blocks, or global threads. + * This is the lowest overhead mapping, but requires that there are the same + * number of physical threads, blocks, or global threads as map requests. + * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space */ -template < named_dim ... 
dims > -using cuda_flatten_block_direct = cuda_flatten_indexer_direct< - cuda::IndexGlobal...>; - -using cuda_flatten_block_x_direct = cuda_flatten_block_direct; -using cuda_flatten_block_y_direct = cuda_flatten_block_direct; -using cuda_flatten_block_z_direct = cuda_flatten_block_direct; - -using cuda_flatten_block_xy_direct = cuda_flatten_block_direct; -using cuda_flatten_block_xz_direct = cuda_flatten_block_direct; -using cuda_flatten_block_yx_direct = cuda_flatten_block_direct; -using cuda_flatten_block_yz_direct = cuda_flatten_block_direct; -using cuda_flatten_block_zx_direct = cuda_flatten_block_direct; -using cuda_flatten_block_zy_direct = cuda_flatten_block_direct; - -using cuda_flatten_block_xyz_direct = cuda_flatten_block_direct; -using cuda_flatten_block_xzy_direct = cuda_flatten_block_direct; -using cuda_flatten_block_yxz_direct = cuda_flatten_block_direct; -using cuda_flatten_block_yzx_direct = cuda_flatten_block_direct; -using cuda_flatten_block_zxy_direct = cuda_flatten_block_direct; -using cuda_flatten_block_zyx_direct = cuda_flatten_block_direct; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(flatten_, unchecked) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, unchecked) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, unchecked) /* - * Maps segment indices to flattened CUDA blocks. - * Reshapes multiple physical blocks into a 1D iteration space - * Uses block-stride looping to exceed the maximum number of physical blocks + * Maps segment indices to flattened CUDA threads, blocks, or global threads. + * This is a low overhead mapping, but requires that there are enough + * physical threads, blocks, or global threads to fit all of the direct map + * requests. + * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space */ -template < named_dim ... 
dims > -using cuda_flatten_block_loop = cuda_flatten_indexer_loop< - cuda::IndexGlobal...>; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(flatten_, direct) -using cuda_flatten_block_x_loop = cuda_flatten_block_loop; -using cuda_flatten_block_y_loop = cuda_flatten_block_loop; -using cuda_flatten_block_z_loop = cuda_flatten_block_loop; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, direct) -using cuda_flatten_block_xy_loop = cuda_flatten_block_loop; -using cuda_flatten_block_xz_loop = cuda_flatten_block_loop; -using cuda_flatten_block_yx_loop = cuda_flatten_block_loop; -using cuda_flatten_block_yz_loop = cuda_flatten_block_loop; -using cuda_flatten_block_zx_loop = cuda_flatten_block_loop; -using cuda_flatten_block_zy_loop = cuda_flatten_block_loop; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, direct) -using cuda_flatten_block_xyz_loop = cuda_flatten_block_loop; -using cuda_flatten_block_xzy_loop = cuda_flatten_block_loop; -using cuda_flatten_block_yxz_loop = cuda_flatten_block_loop; -using cuda_flatten_block_yzx_loop = cuda_flatten_block_loop; -using cuda_flatten_block_zxy_loop = cuda_flatten_block_loop; -using cuda_flatten_block_zyx_loop = cuda_flatten_block_loop; +/* + * Maps segment indices to flattened CUDA threads, blocks, or global threads. 
+ * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space + * Uses block-stride or grid-stride looping to exceed the maximum number of + * physical threads, blocks, or global threads + */ +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(flatten_, loop) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, loop) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, loop) + + +// helper to generate the many one size policy aliases +#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, scope, mapping) \ + \ + template < int X_SIZE > \ + using cuda_##flatten##scope##_size_x_##mapping = cuda_##flatten##indexer_##mapping>; \ + template < int Y_SIZE > \ + using cuda_##flatten##scope##_size_y_##mapping = cuda_##flatten##indexer_##mapping>; \ + template < int Z_SIZE > \ + using cuda_##flatten##scope##_size_z_##mapping = cuda_##flatten##indexer_##mapping>; \ + \ + template < int X_SIZE, int Y_SIZE > \ + using cuda_##flatten##scope##_size_xy_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_y>; \ + template < int X_SIZE, int Z_SIZE > \ + using cuda_##flatten##scope##_size_xz_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_z>; \ + template < int Y_SIZE, int X_SIZE > \ + using cuda_##flatten##scope##_size_yx_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_x>; \ + template < int Y_SIZE, int Z_SIZE > \ + using cuda_##flatten##scope##_size_yz_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_z>; \ + template < int Z_SIZE, int X_SIZE > \ + using cuda_##flatten##scope##_size_zx_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_x>; \ + template < int Z_SIZE, int Y_SIZE > \ + using cuda_##flatten##scope##_size_zy_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_y>; \ + \ + template < int X_SIZE, int Y_SIZE, int Z_SIZE > \ + using cuda_##flatten##scope##_size_xyz_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_y, 
cuda::scope##_z>; \ + template < int X_SIZE, int Z_SIZE, int Y_SIZE > \ + using cuda_##flatten##scope##_size_xzy_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_z, cuda::scope##_y>; \ + template < int Y_SIZE, int X_SIZE, int Z_SIZE > \ + using cuda_##flatten##scope##_size_yxz_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_x, cuda::scope##_z>; \ + template < int Y_SIZE, int Z_SIZE, int X_SIZE > \ + using cuda_##flatten##scope##_size_yzx_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_z, cuda::scope##_x>; \ + template < int Z_SIZE, int X_SIZE, int Y_SIZE > \ + using cuda_##flatten##scope##_size_zxy_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_x, cuda::scope##_y>; \ + template < int Z_SIZE, int Y_SIZE, int X_SIZE > \ + using cuda_##flatten##scope##_size_zyx_##mapping = cuda_##flatten##indexer_##mapping, cuda::scope##_y, cuda::scope##_x>; + +// helper to generate the many two size policy aliases +#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, scope, mapping) \ + \ + template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_x_##mapping = cuda_##flatten##indexer_##mapping>; \ + template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_y_##mapping = cuda_##flatten##indexer_##mapping>; \ + template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_z_##mapping = cuda_##flatten##indexer_##mapping>; \ + \ + template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, \ + int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_xy_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_y>; \ + template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, \ + int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \ + using 
cuda_##flatten##scope##_size_xz_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_z>; \ + template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, \ + int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_yx_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_x>; \ + template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, \ + int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_yz_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_z>; \ + template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, \ + int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_zx_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_x>; \ + template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, \ + int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_zy_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_y>; \ + \ + template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, \ + int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_xyz_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_y, \ + cuda::scope##_z>; \ + template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, \ + int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_xzy_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_z, \ + cuda::scope##_y>; \ + template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, \ + int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = 
named_usage::unspecified > \ + using cuda_##flatten##scope##_size_yxz_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_x, \ + cuda::scope##_z>; \ + template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, \ + int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_yzx_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_z, \ + cuda::scope##_x>; \ + template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, \ + int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_zxy_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_x, \ + cuda::scope##_y>; \ + template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, \ + int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \ + using cuda_##flatten##scope##_size_zyx_##mapping = cuda_##flatten##indexer_##mapping, \ + cuda::scope##_y, \ + cuda::scope##_x>; + +// helper to generate the many thread size policy aliases +#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten, mapping) \ + RAJA_INTERNAL_CUDA_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, thread, mapping) + +// helper to generate the many block size policy aliases +#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten, mapping) \ + RAJA_INTERNAL_CUDA_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, block, mapping) + +// helper to generate the many global size policy aliases +#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten, mapping) \ + RAJA_INTERNAL_CUDA_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, global, mapping) /*! - * Maps segment indices to CUDA global threads. 
- * This is the lowest overhead mapping, but requires that there are enough - * physical threads to fit all of the direct map requests. + * Maps segment indices to CUDA threads, blocks, or global threads. + * This is the lowest overhead mapping, but requires that there are the same + * number of physical threads as the map requests. */ -template < named_dim ... dims > -using cuda_global_direct = cuda_indexer_direct< - cuda::IndexGlobal...>; - -using cuda_global_x_direct = cuda_global_direct; -using cuda_global_y_direct = cuda_global_direct; -using cuda_global_z_direct = cuda_global_direct; - -using cuda_global_xy_direct = cuda_global_direct; -using cuda_global_xz_direct = cuda_global_direct; -using cuda_global_yx_direct = cuda_global_direct; -using cuda_global_yz_direct = cuda_global_direct; -using cuda_global_zx_direct = cuda_global_direct; -using cuda_global_zy_direct = cuda_global_direct; - -using cuda_global_xyz_direct = cuda_global_direct; -using cuda_global_xzy_direct = cuda_global_direct; -using cuda_global_yxz_direct = cuda_global_direct; -using cuda_global_yzx_direct = cuda_global_direct; -using cuda_global_zxy_direct = cuda_global_direct; -using cuda_global_zyx_direct = cuda_global_direct; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, unchecked) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, unchecked) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, unchecked) /*! - * Maps segment indices to CUDA global threads. - * Uses grid-stride looping to exceed the maximum number of global threads + * Maps segment indices to CUDA threads, blocks, or global threads. + * This is a low overhead mapping, but requires that there are enough + * physical threads to fit all of the direct map requests. */ -template < named_dim ... dims > -using cuda_global_loop = cuda_indexer_loop< - cuda::IndexGlobal...>; - -template < named_dim ... 
dims > -using cuda_global_syncable_loop = cuda_indexer_syncable_loop< - cuda::IndexGlobal...>; - -using cuda_global_x_loop = cuda_global_loop; -using cuda_global_y_loop = cuda_global_loop; -using cuda_global_z_loop = cuda_global_loop; - -using cuda_global_xy_loop = cuda_global_loop; -using cuda_global_xz_loop = cuda_global_loop; -using cuda_global_yx_loop = cuda_global_loop; -using cuda_global_yz_loop = cuda_global_loop; -using cuda_global_zx_loop = cuda_global_loop; -using cuda_global_zy_loop = cuda_global_loop; - -using cuda_global_xyz_loop = cuda_global_loop; -using cuda_global_xzy_loop = cuda_global_loop; -using cuda_global_yxz_loop = cuda_global_loop; -using cuda_global_yzx_loop = cuda_global_loop; -using cuda_global_zxy_loop = cuda_global_loop; -using cuda_global_zyx_loop = cuda_global_loop; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, direct) -/* - * Maps segment indices to flattened CUDA global threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical global threads to fit all of the direct map requests. - * Reshapes multiple physical global threads into a 1D iteration space - */ -template < named_dim ... 
dims > -using cuda_flatten_global_direct = cuda_flatten_indexer_direct< - cuda::IndexGlobal...>; - -using cuda_flatten_global_x_direct = cuda_flatten_global_direct; -using cuda_flatten_global_y_direct = cuda_flatten_global_direct; -using cuda_flatten_global_z_direct = cuda_flatten_global_direct; - -using cuda_flatten_global_xy_direct = cuda_flatten_global_direct; -using cuda_flatten_global_xz_direct = cuda_flatten_global_direct; -using cuda_flatten_global_yx_direct = cuda_flatten_global_direct; -using cuda_flatten_global_yz_direct = cuda_flatten_global_direct; -using cuda_flatten_global_zx_direct = cuda_flatten_global_direct; -using cuda_flatten_global_zy_direct = cuda_flatten_global_direct; - -using cuda_flatten_global_xyz_direct = cuda_flatten_global_direct; -using cuda_flatten_global_xzy_direct = cuda_flatten_global_direct; -using cuda_flatten_global_yxz_direct = cuda_flatten_global_direct; -using cuda_flatten_global_yzx_direct = cuda_flatten_global_direct; -using cuda_flatten_global_zxy_direct = cuda_flatten_global_direct; -using cuda_flatten_global_zyx_direct = cuda_flatten_global_direct; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, direct) -/* - * Maps segment indices to flattened CUDA global threads. - * Reshapes multiple physical global threads into a 1D iteration space - * Uses global thread-stride looping to exceed the maximum number of physical global threads - */ -template < named_dim ... dims > -using cuda_flatten_global_loop = cuda_flatten_indexer_loop< - cuda::IndexGlobal...>; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, direct) -using cuda_flatten_global_x_loop = cuda_flatten_global_loop; -using cuda_flatten_global_y_loop = cuda_flatten_global_loop; -using cuda_flatten_global_z_loop = cuda_flatten_global_loop; +/*! + * Maps segment indices to CUDA threads, blocks, or global threads. + * Uses block-stride or grid-stride looping to exceed the maximum number of + * threads, blocks, or global threads. 
+ */ +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, loop) -using cuda_flatten_global_xy_loop = cuda_flatten_global_loop; -using cuda_flatten_global_xz_loop = cuda_flatten_global_loop; -using cuda_flatten_global_yx_loop = cuda_flatten_global_loop; -using cuda_flatten_global_yz_loop = cuda_flatten_global_loop; -using cuda_flatten_global_zx_loop = cuda_flatten_global_loop; -using cuda_flatten_global_zy_loop = cuda_flatten_global_loop; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, loop) -using cuda_flatten_global_xyz_loop = cuda_flatten_global_loop; -using cuda_flatten_global_xzy_loop = cuda_flatten_global_loop; -using cuda_flatten_global_yxz_loop = cuda_flatten_global_loop; -using cuda_flatten_global_yzx_loop = cuda_flatten_global_loop; -using cuda_flatten_global_zxy_loop = cuda_flatten_global_loop; -using cuda_flatten_global_zyx_loop = cuda_flatten_global_loop; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, loop) -/*! - * Maps segment indices to CUDA global threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical threads to fit all of the direct map requests. +/* + * Maps segment indices to flattened CUDA threads, blocks, or global threads. + * This is the lowest overhead mapping, but requires that there are the same + * number of physical threads, blocks, or global threads as the map requests. + * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space. 
*/ -template < int X_BLOCK_SIZE > -using cuda_thread_size_x_direct = cuda_indexer_direct>; -template < int Y_BLOCK_SIZE > -using cuda_thread_size_y_direct = cuda_indexer_direct>; -template < int Z_BLOCK_SIZE > -using cuda_thread_size_z_direct = cuda_indexer_direct>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_thread_size_xy_direct = cuda_indexer_direct, cuda::thread_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_thread_size_xz_direct = cuda_indexer_direct, cuda::thread_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_thread_size_yx_direct = cuda_indexer_direct, cuda::thread_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_thread_size_yz_direct = cuda_indexer_direct, cuda::thread_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_thread_size_zx_direct = cuda_indexer_direct, cuda::thread_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_thread_size_zy_direct = cuda_indexer_direct, cuda::thread_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_thread_size_xyz_direct = cuda_indexer_direct, cuda::thread_y, cuda::thread_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_thread_size_xzy_direct = cuda_indexer_direct, cuda::thread_z, cuda::thread_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_thread_size_yxz_direct = cuda_indexer_direct, cuda::thread_x, cuda::thread_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_thread_size_yzx_direct = cuda_indexer_direct, cuda::thread_z, cuda::thread_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_thread_size_zxy_direct = cuda_indexer_direct, cuda::thread_x, cuda::thread_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_thread_size_zyx_direct = cuda_indexer_direct, cuda::thread_y, cuda::thread_x>; - - -template < int X_GRID_SIZE > -using 
cuda_block_size_x_direct = cuda_indexer_direct>; -template < int Y_GRID_SIZE > -using cuda_block_size_y_direct = cuda_indexer_direct>; -template < int Z_GRID_SIZE > -using cuda_block_size_z_direct = cuda_indexer_direct>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE > -using cuda_block_size_xy_direct = cuda_indexer_direct, cuda::block_y>; -template < int X_GRID_SIZE, int Z_GRID_SIZE > -using cuda_block_size_xz_direct = cuda_indexer_direct, cuda::block_z>; -template < int Y_GRID_SIZE, int X_GRID_SIZE > -using cuda_block_size_yx_direct = cuda_indexer_direct, cuda::block_x>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE > -using cuda_block_size_yz_direct = cuda_indexer_direct, cuda::block_z>; -template < int Z_GRID_SIZE, int X_GRID_SIZE > -using cuda_block_size_zx_direct = cuda_indexer_direct, cuda::block_x>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE > -using cuda_block_size_zy_direct = cuda_indexer_direct, cuda::block_y>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE > -using cuda_block_size_xyz_direct = cuda_indexer_direct, cuda::block_y, cuda::block_z>; -template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE > -using cuda_block_size_xzy_direct = cuda_indexer_direct, cuda::block_z, cuda::block_y>; -template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE > -using cuda_block_size_yxz_direct = cuda_indexer_direct, cuda::block_x, cuda::block_z>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE > -using cuda_block_size_yzx_direct = cuda_indexer_direct, cuda::block_z, cuda::block_x>; -template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE > -using cuda_block_size_zxy_direct = cuda_indexer_direct, cuda::block_x, cuda::block_y>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE > -using cuda_block_size_zyx_direct = cuda_indexer_direct, cuda::block_y, cuda::block_x>; - - -template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_x_direct = cuda_indexer_direct>; -template < 
int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_y_direct = cuda_indexer_direct>; -template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_z_direct = cuda_indexer_direct>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_xy_direct = cuda_indexer_direct, - cuda::global_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_xz_direct = cuda_indexer_direct, - cuda::global_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_yx_direct = cuda_indexer_direct, - cuda::global_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_yz_direct = cuda_indexer_direct, - cuda::global_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_zx_direct = cuda_indexer_direct, - cuda::global_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_zy_direct = cuda_indexer_direct, - cuda::global_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_xyz_direct = cuda_indexer_direct, - cuda::global_y, - cuda::global_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, 
int Y_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_xzy_direct = cuda_indexer_direct, - cuda::global_z, - cuda::global_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_yxz_direct = cuda_indexer_direct, - cuda::global_x, - cuda::global_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_yzx_direct = cuda_indexer_direct, - cuda::global_z, - cuda::global_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_zxy_direct = cuda_indexer_direct, - cuda::global_x, - cuda::global_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_zyx_direct = cuda_indexer_direct, - cuda::global_y, - cuda::global_x>; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, unchecked) -/*! - * Maps segment indices to CUDA global threads. 
- * Uses grid-stride looping to exceed the maximum number of global threads - */ -template < int X_BLOCK_SIZE > -using cuda_thread_size_x_loop = cuda_indexer_loop>; -template < int Y_BLOCK_SIZE > -using cuda_thread_size_y_loop = cuda_indexer_loop>; -template < int Z_BLOCK_SIZE > -using cuda_thread_size_z_loop = cuda_indexer_loop>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_thread_size_xy_loop = cuda_indexer_loop, cuda::thread_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_thread_size_xz_loop = cuda_indexer_loop, cuda::thread_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_thread_size_yx_loop = cuda_indexer_loop, cuda::thread_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_thread_size_yz_loop = cuda_indexer_loop, cuda::thread_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_thread_size_zx_loop = cuda_indexer_loop, cuda::thread_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_thread_size_zy_loop = cuda_indexer_loop, cuda::thread_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_thread_size_xyz_loop = cuda_indexer_loop, cuda::thread_y, cuda::thread_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_thread_size_xzy_loop = cuda_indexer_loop, cuda::thread_z, cuda::thread_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_thread_size_yxz_loop = cuda_indexer_loop, cuda::thread_x, cuda::thread_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_thread_size_yzx_loop = cuda_indexer_loop, cuda::thread_z, cuda::thread_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_thread_size_zxy_loop = cuda_indexer_loop, cuda::thread_x, cuda::thread_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_thread_size_zyx_loop = cuda_indexer_loop, cuda::thread_y, cuda::thread_x>; - - -template < int 
X_GRID_SIZE > -using cuda_block_size_x_loop = cuda_indexer_loop>; -template < int Y_GRID_SIZE > -using cuda_block_size_y_loop = cuda_indexer_loop>; -template < int Z_GRID_SIZE > -using cuda_block_size_z_loop = cuda_indexer_loop>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE > -using cuda_block_size_xy_loop = cuda_indexer_loop, cuda::block_y>; -template < int X_GRID_SIZE, int Z_GRID_SIZE > -using cuda_block_size_xz_loop = cuda_indexer_loop, cuda::block_z>; -template < int Y_GRID_SIZE, int X_GRID_SIZE > -using cuda_block_size_yx_loop = cuda_indexer_loop, cuda::block_x>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE > -using cuda_block_size_yz_loop = cuda_indexer_loop, cuda::block_z>; -template < int Z_GRID_SIZE, int X_GRID_SIZE > -using cuda_block_size_zx_loop = cuda_indexer_loop, cuda::block_x>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE > -using cuda_block_size_zy_loop = cuda_indexer_loop, cuda::block_y>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE > -using cuda_block_size_xyz_loop = cuda_indexer_loop, cuda::block_y, cuda::block_z>; -template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE > -using cuda_block_size_xzy_loop = cuda_indexer_loop, cuda::block_z, cuda::block_y>; -template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE > -using cuda_block_size_yxz_loop = cuda_indexer_loop, cuda::block_x, cuda::block_z>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE > -using cuda_block_size_yzx_loop = cuda_indexer_loop, cuda::block_z, cuda::block_x>; -template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE > -using cuda_block_size_zxy_loop = cuda_indexer_loop, cuda::block_x, cuda::block_y>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE > -using cuda_block_size_zyx_loop = cuda_indexer_loop, cuda::block_y, cuda::block_x>; - - -template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_x_loop = cuda_indexer_loop>; -template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = 
named_usage::unspecified > -using cuda_global_size_y_loop = cuda_indexer_loop>; -template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_z_loop = cuda_indexer_loop>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_xy_loop = cuda_indexer_loop, - cuda::global_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_xz_loop = cuda_indexer_loop, - cuda::global_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_yx_loop = cuda_indexer_loop, - cuda::global_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_yz_loop = cuda_indexer_loop, - cuda::global_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_zx_loop = cuda_indexer_loop, - cuda::global_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_zy_loop = cuda_indexer_loop, - cuda::global_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_xyz_loop = cuda_indexer_loop, - cuda::global_y, - cuda::global_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using 
cuda_global_size_xzy_loop = cuda_indexer_loop, - cuda::global_z, - cuda::global_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_yxz_loop = cuda_indexer_loop, - cuda::global_x, - cuda::global_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_yzx_loop = cuda_indexer_loop, - cuda::global_z, - cuda::global_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_zxy_loop = cuda_indexer_loop, - cuda::global_x, - cuda::global_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_global_size_zyx_loop = cuda_indexer_loop, - cuda::global_y, - cuda::global_x>; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, unchecked) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, unchecked) /* - * Maps segment indices to flattened CUDA global threads. - * This is the lowest overhead mapping, but requires that there are enough - * physical global threads to fit all of the direct map requests. - * Reshapes multiple physical global threads into a 1D iteration space + * Maps segment indices to flattened CUDA threads, blocks, or global threads. + * This is a low overhead mapping, but requires that there are enough + * physical threads, blocks, or global threads to fit all of the direct map + * requests. 
+ * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space. */ -template < int X_BLOCK_SIZE > -using cuda_flatten_thread_size_x_direct = cuda_flatten_indexer_direct>; -template < int Y_BLOCK_SIZE > -using cuda_flatten_thread_size_y_direct = cuda_flatten_indexer_direct>; -template < int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_z_direct = cuda_flatten_indexer_direct>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_flatten_thread_size_xy_direct = cuda_flatten_indexer_direct, cuda::thread_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_xz_direct = cuda_flatten_indexer_direct, cuda::thread_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_flatten_thread_size_yx_direct = cuda_flatten_indexer_direct, cuda::thread_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_yz_direct = cuda_flatten_indexer_direct, cuda::thread_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_flatten_thread_size_zx_direct = cuda_flatten_indexer_direct, cuda::thread_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_flatten_thread_size_zy_direct = cuda_flatten_indexer_direct, cuda::thread_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_xyz_direct = cuda_flatten_indexer_direct, cuda::thread_y, cuda::thread_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_flatten_thread_size_xzy_direct = cuda_flatten_indexer_direct, cuda::thread_z, cuda::thread_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_yxz_direct = cuda_flatten_indexer_direct, cuda::thread_x, cuda::thread_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_flatten_thread_size_yzx_direct = cuda_flatten_indexer_direct, cuda::thread_z, cuda::thread_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int 
Y_BLOCK_SIZE > -using cuda_flatten_thread_size_zxy_direct = cuda_flatten_indexer_direct, cuda::thread_x, cuda::thread_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_flatten_thread_size_zyx_direct = cuda_flatten_indexer_direct, cuda::thread_y, cuda::thread_x>; - - -template < int X_GRID_SIZE > -using cuda_flatten_block_size_x_direct = cuda_flatten_indexer_direct>; -template < int Y_GRID_SIZE > -using cuda_flatten_block_size_y_direct = cuda_flatten_indexer_direct>; -template < int Z_GRID_SIZE > -using cuda_flatten_block_size_z_direct = cuda_flatten_indexer_direct>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE > -using cuda_flatten_block_size_xy_direct = cuda_flatten_indexer_direct, cuda::block_y>; -template < int X_GRID_SIZE, int Z_GRID_SIZE > -using cuda_flatten_block_size_xz_direct = cuda_flatten_indexer_direct, cuda::block_z>; -template < int Y_GRID_SIZE, int X_GRID_SIZE > -using cuda_flatten_block_size_yx_direct = cuda_flatten_indexer_direct, cuda::block_x>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE > -using cuda_flatten_block_size_yz_direct = cuda_flatten_indexer_direct, cuda::block_z>; -template < int Z_GRID_SIZE, int X_GRID_SIZE > -using cuda_flatten_block_size_zx_direct = cuda_flatten_indexer_direct, cuda::block_x>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE > -using cuda_flatten_block_size_zy_direct = cuda_flatten_indexer_direct, cuda::block_y>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE > -using cuda_flatten_block_size_xyz_direct = cuda_flatten_indexer_direct, cuda::block_y, cuda::block_z>; -template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE > -using cuda_flatten_block_size_xzy_direct = cuda_flatten_indexer_direct, cuda::block_z, cuda::block_y>; -template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE > -using cuda_flatten_block_size_yxz_direct = cuda_flatten_indexer_direct, cuda::block_x, cuda::block_z>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE > -using 
cuda_flatten_block_size_yzx_direct = cuda_flatten_indexer_direct, cuda::block_z, cuda::block_x>; -template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE > -using cuda_flatten_block_size_zxy_direct = cuda_flatten_indexer_direct, cuda::block_x, cuda::block_y>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE > -using cuda_flatten_block_size_zyx_direct = cuda_flatten_indexer_direct, cuda::block_y, cuda::block_x>; - - -template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_x_direct = cuda_flatten_indexer_direct>; -template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_y_direct = cuda_flatten_indexer_direct>; -template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_z_direct = cuda_flatten_indexer_direct>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_xy_direct = cuda_flatten_indexer_direct, - cuda::global_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_xz_direct = cuda_flatten_indexer_direct, - cuda::global_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_yx_direct = cuda_flatten_indexer_direct, - cuda::global_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_yz_direct = cuda_flatten_indexer_direct, - cuda::global_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_zx_direct = 
cuda_flatten_indexer_direct, - cuda::global_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_zy_direct = cuda_flatten_indexer_direct, - cuda::global_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_xyz_direct = cuda_flatten_indexer_direct, - cuda::global_y, - cuda::global_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_xzy_direct = cuda_flatten_indexer_direct, - cuda::global_z, - cuda::global_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_yxz_direct = cuda_flatten_indexer_direct, - cuda::global_x, - cuda::global_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_yzx_direct = cuda_flatten_indexer_direct, - cuda::global_z, - cuda::global_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_zxy_direct = cuda_flatten_indexer_direct, - cuda::global_x, - cuda::global_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, 
int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_zyx_direct = cuda_flatten_indexer_direct, - cuda::global_y, - cuda::global_x>; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, direct) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, direct) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, direct) /* - * Maps segment indices to flattened CUDA global threads. - * Reshapes multiple physical global threads into a 1D iteration space - * Uses global thread-stride looping to exceed the maximum number of physical global threads + * Maps segment indices to flattened CUDA threads, blocks, or global threads. + * Reshapes multiple physical threads, blocks, or global threads into a 1D + * iteration space. + * Uses block-stride or grid-stride looping to exceed the maximum number of + * physical threads, blocks, or global threads. */ -template < int X_BLOCK_SIZE > -using cuda_flatten_thread_size_x_loop = cuda_flatten_indexer_loop>; -template < int Y_BLOCK_SIZE > -using cuda_flatten_thread_size_y_loop = cuda_flatten_indexer_loop>; -template < int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_z_loop = cuda_flatten_indexer_loop>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_flatten_thread_size_xy_loop = cuda_flatten_indexer_loop, cuda::thread_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_xz_loop = cuda_flatten_indexer_loop, cuda::thread_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_flatten_thread_size_yx_loop = cuda_flatten_indexer_loop, cuda::thread_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_yz_loop = cuda_flatten_indexer_loop, cuda::thread_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_flatten_thread_size_zx_loop = cuda_flatten_indexer_loop, cuda::thread_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_flatten_thread_size_zy_loop = 
cuda_flatten_indexer_loop, cuda::thread_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_xyz_loop = cuda_flatten_indexer_loop, cuda::thread_y, cuda::thread_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_flatten_thread_size_xzy_loop = cuda_flatten_indexer_loop, cuda::thread_z, cuda::thread_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE > -using cuda_flatten_thread_size_yxz_loop = cuda_flatten_indexer_loop, cuda::thread_x, cuda::thread_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_flatten_thread_size_yzx_loop = cuda_flatten_indexer_loop, cuda::thread_z, cuda::thread_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE > -using cuda_flatten_thread_size_zxy_loop = cuda_flatten_indexer_loop, cuda::thread_x, cuda::thread_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE > -using cuda_flatten_thread_size_zyx_loop = cuda_flatten_indexer_loop, cuda::thread_y, cuda::thread_x>; - - -template < int X_GRID_SIZE > -using cuda_flatten_block_size_x_loop = cuda_flatten_indexer_loop>; -template < int Y_GRID_SIZE > -using cuda_flatten_block_size_y_loop = cuda_flatten_indexer_loop>; -template < int Z_GRID_SIZE > -using cuda_flatten_block_size_z_loop = cuda_flatten_indexer_loop>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE > -using cuda_flatten_block_size_xy_loop = cuda_flatten_indexer_loop, cuda::block_y>; -template < int X_GRID_SIZE, int Z_GRID_SIZE > -using cuda_flatten_block_size_xz_loop = cuda_flatten_indexer_loop, cuda::block_z>; -template < int Y_GRID_SIZE, int X_GRID_SIZE > -using cuda_flatten_block_size_yx_loop = cuda_flatten_indexer_loop, cuda::block_x>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE > -using cuda_flatten_block_size_yz_loop = cuda_flatten_indexer_loop, cuda::block_z>; -template < int Z_GRID_SIZE, int X_GRID_SIZE > -using cuda_flatten_block_size_zx_loop = 
cuda_flatten_indexer_loop, cuda::block_x>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE > -using cuda_flatten_block_size_zy_loop = cuda_flatten_indexer_loop, cuda::block_y>; - -template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE > -using cuda_flatten_block_size_xyz_loop = cuda_flatten_indexer_loop, cuda::block_y, cuda::block_z>; -template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE > -using cuda_flatten_block_size_xzy_loop = cuda_flatten_indexer_loop, cuda::block_z, cuda::block_y>; -template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE > -using cuda_flatten_block_size_yxz_loop = cuda_flatten_indexer_loop, cuda::block_x, cuda::block_z>; -template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE > -using cuda_flatten_block_size_yzx_loop = cuda_flatten_indexer_loop, cuda::block_z, cuda::block_x>; -template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE > -using cuda_flatten_block_size_zxy_loop = cuda_flatten_indexer_loop, cuda::block_x, cuda::block_y>; -template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE > -using cuda_flatten_block_size_zyx_loop = cuda_flatten_indexer_loop, cuda::block_y, cuda::block_x>; - - -template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_x_loop = cuda_flatten_indexer_loop>; -template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_y_loop = cuda_flatten_indexer_loop>; -template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_z_loop = cuda_flatten_indexer_loop>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_xy_loop = cuda_flatten_indexer_loop, - cuda::global_y>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using 
cuda_flatten_global_size_xz_loop = cuda_flatten_indexer_loop, - cuda::global_z>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_yx_loop = cuda_flatten_indexer_loop, - cuda::global_x>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_yz_loop = cuda_flatten_indexer_loop, - cuda::global_z>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_zx_loop = cuda_flatten_indexer_loop, - cuda::global_x>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_zy_loop = cuda_flatten_indexer_loop, - cuda::global_y>; - -template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_xyz_loop = cuda_flatten_indexer_loop, - cuda::global_y, - cuda::global_z>; -template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, - int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_xzy_loop = cuda_flatten_indexer_loop, - cuda::global_z, - cuda::global_y>; -template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, - int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_yxz_loop = cuda_flatten_indexer_loop, - cuda::global_x, - cuda::global_z>; -template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, - int Y_GRID_SIZE 
= named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_yzx_loop = cuda_flatten_indexer_loop, - cuda::global_z, - cuda::global_x>; -template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_zxy_loop = cuda_flatten_indexer_loop, - cuda::global_x, - cuda::global_y>; -template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, - int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > -using cuda_flatten_global_size_zyx_loop = cuda_flatten_indexer_loop, - cuda::global_y, - cuda::global_x>; +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, loop) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, loop) + +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, loop) /* From 30847df667a342dd28c4e6d4400a4db820966c87 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 25 Aug 2024 21:27:23 -0700 Subject: [PATCH 09/15] Regularize spacing in hip/cuda --- include/RAJA/policy/cuda/launch.hpp | 17 +++++++---------- include/RAJA/policy/hip/launch.hpp | 7 +++---- include/RAJA/policy/hip/policy.hpp | 5 ++++- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index c123d5ce08..2f49f68a96 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -166,7 +166,6 @@ struct LaunchExecute; RAJA::expt::ParamMultiplexer::init(launch_reducers, launch_info); - // // Privatize the loop_body, using make_launch_body to setup reductions // @@ -238,7 +237,6 @@ struct LaunchExecute; auto func = reinterpret_cast( @@ -288,14 +286,13 @@ struct LaunchExecute - static concepts::enable_if_t, - 
RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate>> + template + static concepts::enable_if_t, + RAJA::expt::type_traits::is_ForallParamPack, + concepts::negate>> exec(RAJA::resources::Resource res, const LaunchParams &launch_params, - const char *kernel_name, BODY_IN && body_in, ReduceParams &launch_reducers) + const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers) { - using BODY = camp::decay; auto func = reinterpret_cast( @@ -315,7 +312,6 @@ struct LaunchExecute(launch_params.threads.value[1]), static_cast(launch_params.threads.value[2]) }; - // Only launch kernel if we have something to iterate over constexpr cuda_dim_member_t zero = 0; if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && @@ -329,8 +325,8 @@ struct LaunchExecute; RAJA::expt::ParamMultiplexer::init(launch_reducers, launch_info); @@ -351,6 +347,7 @@ struct LaunchExecute(res); } diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index e1cbd87497..e793899d2d 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -121,8 +121,7 @@ struct LaunchExecute(res); } - - //Version with explicit reduction parameters.. + //Version with explicit reduction parameters.. template static concepts::enable_if_t, RAJA::expt::type_traits::is_ForallParamPack, @@ -228,7 +227,6 @@ void launch_new_reduce_global_fcn_fixed(BODY body_in, ReduceParams reduce_params RAJA::expt::ParamMultiplexer::combine(reduce_params); } - template struct LaunchExecute> { @@ -267,6 +265,7 @@ struct LaunchExecute> { { size_t shared_mem_size = params.shared_mem_size; + // // Privatize the loop_body, using make_launch_body to setup reductions // @@ -286,7 +285,7 @@ struct LaunchExecute> { return resources::EventProxy(res); } - //Version with explicit reduction parameters.. + //Version with explicit reduction parameters.. 
template static concepts::enable_if_t, RAJA::expt::type_traits::is_ForallParamPack, diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 9872bea553..2a3c9176a8 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -405,7 +405,6 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// - template < typename tuning > struct hip_reduce_policy : public RAJA:: @@ -1535,6 +1534,7 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< // helper to generate the many policy aliases #define RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, scope, mapping) \ + \ using hip_##flatten##scope##_x_##mapping = hip_##flatten##scope##_##mapping; \ using hip_##flatten##scope##_y_##mapping = hip_##flatten##scope##_##mapping; \ using hip_##flatten##scope##_z_##mapping = hip_##flatten##scope##_##mapping; \ @@ -1558,6 +1558,7 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< template < named_dim ... dims > \ using hip_##flatten##thread_##mapping = hip_##flatten##indexer_##mapping< \ hip::IndexGlobal...>; \ + \ RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, thread, mapping) // helper to generate the many block policy aliases @@ -1565,6 +1566,7 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< template < named_dim ... dims > \ using hip_##flatten##block_##mapping = hip_##flatten##indexer_##mapping< \ hip::IndexGlobal...>; \ + \ RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, block, mapping) // helper to generate the many global policy aliases @@ -1572,6 +1574,7 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< template < named_dim ... 
dims > \ using hip_##flatten##global_##mapping = hip_##flatten##indexer_##mapping< \ hip::IndexGlobal...>; \ + \ RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, global, mapping) From c0ac18e95048fa8c04a8127a5d8e3dddef402231 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 26 Aug 2024 09:44:07 -0700 Subject: [PATCH 10/15] Fill out 2d and 3d loop and tile implementations There were a number missing for cuda/hip --- include/RAJA/pattern/launch/launch_core.hpp | 87 ++-- include/RAJA/policy/cuda/launch.hpp | 446 ++++++++++++++++++++ include/RAJA/policy/hip/launch.hpp | 446 ++++++++++++++++++++ 3 files changed, 955 insertions(+), 24 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 7ea7ce57ef..ff10f04dae 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -547,8 +547,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx, { LoopExecute, SEGMENT>::exec(ctx, - segment, - body); + segment, body); } template , SEGMENT>::exec(ctx, - segment, - body); + segment, body); } namespace expt @@ -580,9 +578,22 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx, { LoopExecute, SEGMENT>::exec(ctx, - segment0, - segment1, - body); + segment0, segment1, body); +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) +{ + + LoopICountExecute, SEGMENT>::exec(ctx, + segment0, segment1, body); } RAJA_SUPPRESS_HD_WARN @@ -617,7 +628,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, { LoopICountExecute, SEGMENT>::exec(ctx, - segment0, segment1, segment2, body); + segment0, segment1, segment2, body); } } //namespace expt @@ -640,9 +651,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, { TileExecute, SEGMENT>::exec(ctx, - tile_size, - segment, - body); + tile_size, segment, body); 
} template , SEGMENT>::exec(ctx, - tile_size, - segment, - body); + tile_size, segment, body); } namespace expt @@ -678,11 +685,44 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, { TileExecute, SEGMENT>::exec(ctx, - tile_size0, - tile_size1, - segment0, - segment1, - body); + tile_size0, tile_size1, segment0, segment1, body); +} + +template +RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx, + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) +{ + + TileTCountExecute, SEGMENT>::exec(ctx, + tile_size0, tile_size1, segment0, segment1, body); +} + +template +RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) +{ + + TileExecute, SEGMENT>::exec(ctx, + tile_size0, tile_size1, tile_size2, + segment0, segment1, segment2, body); } template , SEGMENT>::exec(ctx, - tile_size0, - tile_size1, - segment0, - segment1, - body); + tile_size0, tile_size1, tile_size2, + segment0, segment1, segment2, body); } } //namespace expt diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 2f49f68a96..fea2845e57 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -1134,6 +1134,63 @@ struct TileExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, 
static_cast(tile_size1))); + } +}; + +template +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2 = IndexMapper2::template index() * static_cast(tile_size2); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } +}; + template struct TileExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + + if (i0 < len0 && i1 < len1) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } + } +}; + +template +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() 
- segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2 = IndexMapper2::template index() * static_cast(tile_size2); + + if (i0 < len0 && i1 < len1 && i2 < len2) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } + } +}; + template struct TileExecute, kernel_sync_requirement::none, @@ -1183,6 +1308,87 @@ struct TileExecute +struct TileExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t i0_init = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1_init = IndexMapper1::template index() * static_cast(tile_size1); + + const diff_t i0_stride = IndexMapper0::template size() * static_cast(tile_size0); + const diff_t i1_stride = IndexMapper1::template size() * static_cast(tile_size1); + + for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) { + for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } + } + } +}; + +template +struct TileExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1, + IndexMapper2>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE 
void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t i0_init = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1_init = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2_init = IndexMapper2::template index() * static_cast(tile_size2); + + const diff_t i0_stride = IndexMapper0::template size() * static_cast(tile_size0); + const diff_t i1_stride = IndexMapper1::template size() * static_cast(tile_size1); + const diff_t i2_stride = IndexMapper2::template size() * static_cast(tile_size2); + + for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) { + for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) { + for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } + } + } + } +}; + /* CUDA generic tile_tcount implementations @@ -1209,6 +1415,72 @@ struct TileTCountExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } +}; + 
+template +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + const diff_t t2 = IndexMapper2::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + const diff_t i2 = t2 * static_cast(tile_size2); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } +}; + template struct TileTCountExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + + if (i0 < len0 && i1 < len1) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } + } +}; + +template +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + 
SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + const diff_t t2 = IndexMapper2::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + const diff_t i2 = t2 * static_cast(tile_size2); + + if (i0 < len0 && i1 < len1 && i2 < len2) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } + } +}; + template struct TileTCountExecute, kernel_sync_requirement::none, @@ -1261,5 +1610,102 @@ struct TileTCountExecute +struct TileTCountExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t t0_init = IndexMapper0::template index(); + const diff_t t1_init = IndexMapper1::template index(); + + const diff_t i0_init = t0_init * static_cast(tile_size0); + const diff_t i1_init = t1_init * static_cast(tile_size1); + + const diff_t t0_stride = IndexMapper0::template size(); + const diff_t t1_stride = IndexMapper1::template size(); + + const diff_t i0_stride = t0_stride * static_cast(tile_size0); + const diff_t i1_stride = t1_stride * static_cast(tile_size1); + + for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += 
t0_stride) { + for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } + } + } +}; + +template +struct TileTCountExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1, + IndexMapper2>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t t0_init = IndexMapper0::template index(); + const diff_t t1_init = IndexMapper1::template index(); + const diff_t t2_init = IndexMapper2::template index(); + + const diff_t i0_init = t0_init * static_cast(tile_size0); + const diff_t i1_init = t1_init * static_cast(tile_size1); + const diff_t i2_init = t2_init * static_cast(tile_size2); + + const diff_t t0_stride = IndexMapper0::template size(); + const diff_t t1_stride = IndexMapper1::template size(); + const diff_t t2_stride = IndexMapper2::template size(); + + const diff_t i0_stride = t0_stride * static_cast(tile_size0); + const diff_t i1_stride = t1_stride * static_cast(tile_size1); + const diff_t i2_stride = t2_stride * static_cast(tile_size2); + + for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) { + for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) { + for (diff_t i2 = i2_init, t2 = t2_init; i2 < len2; i2 += i2_stride, t2 += t2_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, 
static_cast(tile_size2)), + t0, t1, t2); + } + } + } + } +}; + } // namespace RAJA #endif diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index e793899d2d..18ab91526d 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -1134,6 +1134,63 @@ struct TileExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } +}; + +template +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2 = IndexMapper2::template index() * static_cast(tile_size2); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } +}; + template struct TileExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, 
+ SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + + if (i0 < len0 && i1 < len1) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } + } +}; + +template +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2 = IndexMapper2::template index() * static_cast(tile_size2); + + if (i0 < len0 && i1 < len1 && i2 < len2) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } + } +}; + template struct TileExecute, kernel_sync_requirement::none, @@ -1183,6 +1308,87 @@ struct TileExecute +struct TileExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - 
segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t i0_init = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1_init = IndexMapper1::template index() * static_cast(tile_size1); + + const diff_t i0_stride = IndexMapper0::template size() * static_cast(tile_size0); + const diff_t i1_stride = IndexMapper1::template size() * static_cast(tile_size1); + + for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) { + for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } + } + } +}; + +template +struct TileExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1, + IndexMapper2>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t i0_init = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1_init = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2_init = IndexMapper2::template index() * static_cast(tile_size2); + + const diff_t i0_stride = IndexMapper0::template size() * static_cast(tile_size0); + const diff_t i1_stride = IndexMapper1::template size() * static_cast(tile_size1); + const diff_t i2_stride = IndexMapper2::template size() * static_cast(tile_size2); + + for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) { + for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) { + for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) { + body(segment0.slice(i0, 
static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } + } + } + } +}; + /* HIP generic tile_tcount implementations @@ -1209,6 +1415,72 @@ struct TileTCountExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } +}; + +template +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + const diff_t t2 = IndexMapper2::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + const diff_t i2 = t2 * static_cast(tile_size2); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } +}; + template struct TileTCountExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const 
RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + + if (i0 < len0 && i1 < len1) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } + } +}; + +template +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + const diff_t t2 = IndexMapper2::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + const diff_t i2 = t2 * static_cast(tile_size2); + + if (i0 < len0 && i1 < len1 && i2 < len2) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } + } +}; + template struct TileTCountExecute, kernel_sync_requirement::none, @@ -1261,5 +1610,102 @@ struct TileTCountExecute +struct TileTCountExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1>, + SEGMENT> { + + using diff_t = typename 
std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t t0_init = IndexMapper0::template index(); + const diff_t t1_init = IndexMapper1::template index(); + + const diff_t i0_init = t0_init * static_cast(tile_size0); + const diff_t i1_init = t1_init * static_cast(tile_size1); + + const diff_t t0_stride = IndexMapper0::template size(); + const diff_t t1_stride = IndexMapper1::template size(); + + const diff_t i0_stride = t0_stride * static_cast(tile_size0); + const diff_t i1_stride = t1_stride * static_cast(tile_size1); + + for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) { + for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } + } + } +}; + +template +struct TileTCountExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1, + IndexMapper2>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t t0_init = IndexMapper0::template index(); + const diff_t t1_init = IndexMapper1::template index(); + const diff_t t2_init = IndexMapper2::template index(); + + const diff_t 
i0_init = t0_init * static_cast(tile_size0); + const diff_t i1_init = t1_init * static_cast(tile_size1); + const diff_t i2_init = t2_init * static_cast(tile_size2); + + const diff_t t0_stride = IndexMapper0::template size(); + const diff_t t1_stride = IndexMapper1::template size(); + const diff_t t2_stride = IndexMapper2::template size(); + + const diff_t i0_stride = t0_stride * static_cast(tile_size0); + const diff_t i1_stride = t1_stride * static_cast(tile_size1); + const diff_t i2_stride = t2_stride * static_cast(tile_size2); + + for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) { + for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) { + for (diff_t i2 = i2_init, t2 = t2_init; i2 < len2; i2 += i2_stride, t2 += t2_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } + } + } + } +}; + } // namespace RAJA #endif From fac8dec9ece6f668849deddcb1e51e06eba27f07 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 12 Sep 2024 09:09:55 -0700 Subject: [PATCH 11/15] Add testing for unchecked policies This adds testing for unchecked policies with cuda and hip in kernel and launch. 
--- test/functional/kernel/CMakeLists.txt | 9 +- .../test-kernel-nested-loop.cpp.in | 8 +- .../test-kernel-nested-loop-segments.cpp.in | 4 +- .../test-kernel-nested-loop-view.cpp.in | 16 +- .../test-kernel-nested-loop.cpp.in | 12 +- .../CMakeLists.txt | 10 +- .../test-kernel-tile-count-direct.cpp.in} | 20 +- .../test-kernel-tile-ForICount-direct.hpp} | 22 +- .../test-kernel-tile-TileTCount-direct.hpp} | 22 +- .../tile-icount-tcount-loop/CMakeLists.txt | 36 +++ .../test-kernel-tile-count-loop.cpp.in | 246 ++++++++++++++++++ .../tests/test-kernel-tile-ForICount-loop.hpp | 83 ++++++ .../test-kernel-tile-TileTCount-loop.hpp | 85 ++++++ .../CMakeLists.txt | 36 +++ .../test-kernel-tile-count-unchecked.cpp.in | 119 +++++++++ .../test-kernel-tile-ForICount-unchecked.hpp | 85 ++++++ .../test-kernel-tile-TileTCount-unchecked.hpp | 87 +++++++ .../tile-variants/test-kernel-tiledyn.cpp.in | 4 +- .../test-kernel-tilefixed.cpp.in | 4 +- .../test-kernel-tilelocal.cpp.in | 18 +- test/functional/launch/CMakeLists.txt | 17 +- .../nested_tile_unchecked/CMakeLists.txt | 22 ++ .../test-launch-nested-tile-unchecked.cpp.in | 39 +++ .../test-launch-nested-Tile-Unchecked.hpp | 145 +++++++++++ .../launch/nested_unchecked/CMakeLists.txt | 30 +++ .../test-launch-nested.cpp.in | 39 +++ .../tests/test-launch-nested-Unchecked.hpp | 150 +++++++++++ .../CMakeLists.txt | 10 +- ...h-nested-tile-icount-tcount-direct.cpp.in} | 2 +- ...unch-nested-Tile-iCount-tCount-Direct.hpp} | 6 +- .../CMakeLists.txt | 10 +- ...nch-nested-tile-icount-tcount-loop.cpp.in} | 2 +- ...launch-nested-Tile-iCount-tCount-Loop.hpp} | 6 +- .../CMakeLists.txt | 22 ++ ...nested-tile-icount-tcount-unchecked.cpp.in | 39 +++ ...ch-nested-Tile-iCount-tCount-Unchecked.hpp | 138 ++++++++++ ...launch-direct-teams-threads-1D-execpol.hpp | 4 +- ...nch-unchecked-teams-threads-1D-execpol.hpp | 56 ++++ ...nch-unchecked-teams-threads-3D-execpol.hpp | 68 +++++ 39 files changed, 1642 insertions(+), 89 deletions(-) rename 
test/functional/kernel/{single-loop-tile-icount-tcount => tile-icount-tcount-direct}/CMakeLists.txt (74%) rename test/functional/kernel/{single-loop-tile-icount-tcount/test-kernel-single-loop-tile-count.cpp.in => tile-icount-tcount-direct/test-kernel-tile-count-direct.cpp.in} (93%) rename test/functional/kernel/{single-loop-tile-icount-tcount/tests/test-kernel-single-loop-ForICount.hpp => tile-icount-tcount-direct/tests/test-kernel-tile-ForICount-direct.hpp} (73%) rename test/functional/kernel/{single-loop-tile-icount-tcount/tests/test-kernel-single-loop-TileTCount.hpp => tile-icount-tcount-direct/tests/test-kernel-tile-TileTCount-direct.hpp} (73%) create mode 100644 test/functional/kernel/tile-icount-tcount-loop/CMakeLists.txt create mode 100644 test/functional/kernel/tile-icount-tcount-loop/test-kernel-tile-count-loop.cpp.in create mode 100644 test/functional/kernel/tile-icount-tcount-loop/tests/test-kernel-tile-ForICount-loop.hpp create mode 100644 test/functional/kernel/tile-icount-tcount-loop/tests/test-kernel-tile-TileTCount-loop.hpp create mode 100644 test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt create mode 100644 test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in create mode 100644 test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp create mode 100644 test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp create mode 100644 test/functional/launch/nested_tile_unchecked/CMakeLists.txt create mode 100644 test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in create mode 100644 test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp create mode 100644 test/functional/launch/nested_unchecked/CMakeLists.txt create mode 100644 test/functional/launch/nested_unchecked/test-launch-nested.cpp.in create mode 100644 
test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp rename test/functional/launch/{tile_icount_loop => tile_icount_tcount_direct}/CMakeLists.txt (60%) rename test/functional/launch/{tile_icount_direct/test-launch-nested-tile-icount-direct.cpp.in => tile_icount_tcount_direct/test-launch-nested-tile-icount-tcount-direct.cpp.in} (95%) rename test/functional/launch/{tile_icount_direct/tests/test-launch-nested-Tile-iCount-Direct.hpp => tile_icount_tcount_direct/tests/test-launch-nested-Tile-iCount-tCount-Direct.hpp} (96%) rename test/functional/launch/{tile_icount_direct => tile_icount_tcount_loop}/CMakeLists.txt (61%) rename test/functional/launch/{tile_icount_loop/test-launch-nested-tile-icount-loop.cpp.in => tile_icount_tcount_loop/test-launch-nested-tile-icount-tcount-loop.cpp.in} (95%) rename test/functional/launch/{tile_icount_loop/tests/test-launch-nested-Tile-iCount-Loop.hpp => tile_icount_tcount_loop/tests/test-launch-nested-Tile-iCount-tCount-Loop.hpp} (97%) create mode 100644 test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt create mode 100644 test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in create mode 100644 test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp create mode 100644 test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp create mode 100644 test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp diff --git a/test/functional/kernel/CMakeLists.txt b/test/functional/kernel/CMakeLists.txt index 76771724c9..b7951b822d 100644 --- a/test/functional/kernel/CMakeLists.txt +++ b/test/functional/kernel/CMakeLists.txt @@ -6,6 +6,7 @@ ############################################################################### list(APPEND KERNEL_BACKENDS Sequential) +set(KERNEL_UNCHECKED_BACKENDS "") if(RAJA_ENABLE_OPENMP) list(APPEND KERNEL_BACKENDS OpenMP) @@ -13,10 +14,12 @@ endif() 
if(RAJA_ENABLE_CUDA) list(APPEND KERNEL_BACKENDS Cuda) + list(APPEND KERNEL_UNCHECKED_BACKENDS Cuda) endif() if(RAJA_ENABLE_HIP) list(APPEND KERNEL_BACKENDS Hip) + list(APPEND KERNEL_UNCHECKED_BACKENDS Hip) endif() if(RAJA_ENABLE_SYCL) @@ -49,7 +52,11 @@ add_subdirectory(nested-loop-view-types) add_subdirectory(reduce-loc) -add_subdirectory(single-loop-tile-icount-tcount) +add_subdirectory(tile-icount-tcount-unchecked) + +add_subdirectory(tile-icount-tcount-direct) + +add_subdirectory(tile-icount-tcount-loop) add_subdirectory(tile-variants) diff --git a/test/functional/kernel/nested-loop-reducesum/test-kernel-nested-loop.cpp.in b/test/functional/kernel/nested-loop-reducesum/test-kernel-nested-loop.cpp.in index 4816fc734b..6a493c2d6b 100644 --- a/test/functional/kernel/nested-loop-reducesum/test-kernel-nested-loop.cpp.in +++ b/test/functional/kernel/nested-loop-reducesum/test-kernel-nested-loop.cpp.in @@ -56,8 +56,8 @@ using CudaKernelNestedLoopExecPols = camp::list< // Device Depth 3 ReduceSum Exec Pols NestedLoopData, - NestedLoopData, - NestedLoopData + NestedLoopData, + NestedLoopData >; #endif // RAJA_ENABLE_CUDA @@ -71,8 +71,8 @@ using HipKernelNestedLoopExecPols = camp::list< // Device Depth 3 ReduceSum Exec Pols NestedLoopData, - NestedLoopData, - NestedLoopData + NestedLoopData, + NestedLoopData >; #endif // RAJA_ENABLE_HIP diff --git a/test/functional/kernel/nested-loop-segment-types/test-kernel-nested-loop-segments.cpp.in b/test/functional/kernel/nested-loop-segment-types/test-kernel-nested-loop-segments.cpp.in index 1ec35fb47e..9e620aea4b 100644 --- a/test/functional/kernel/nested-loop-segment-types/test-kernel-nested-loop-segments.cpp.in +++ b/test/functional/kernel/nested-loop-segment-types/test-kernel-nested-loop-segments.cpp.in @@ -83,7 +83,7 @@ using CudaKernelExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::CudaKernelAsync< RAJA::statement::For<0, RAJA::cuda_block_z_loop, - RAJA::statement::For<1, RAJA::cuda_block_y_loop, + 
RAJA::statement::For<1, RAJA::cuda_block_y_unchecked, RAJA::statement::For<2, RAJA::cuda_thread_x_loop, RAJA::statement::Lambda<0, RAJA::Segs<0, 1, 2>> > @@ -113,7 +113,7 @@ using HipKernelExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::HipKernelAsync< RAJA::statement::For<0, RAJA::hip_block_z_loop, - RAJA::statement::For<1, RAJA::hip_block_y_loop, + RAJA::statement::For<1, RAJA::hip_block_y_unchecked, RAJA::statement::For<2, RAJA::hip_thread_x_loop, RAJA::statement::Lambda<0, RAJA::Segs<0, 1, 2>> > diff --git a/test/functional/kernel/nested-loop-view-types/test-kernel-nested-loop-view.cpp.in b/test/functional/kernel/nested-loop-view-types/test-kernel-nested-loop-view.cpp.in index 339439cc85..601ddfafe7 100644 --- a/test/functional/kernel/nested-loop-view-types/test-kernel-nested-loop-view.cpp.in +++ b/test/functional/kernel/nested-loop-view-types/test-kernel-nested-loop-view.cpp.in @@ -145,7 +145,7 @@ camp::list< RAJA::statement::Tile<0, RAJA::tile_fixed<8>, RAJA::cuda_block_y_direct, RAJA::statement::Tile<1, RAJA::tile_fixed<8>, - RAJA::cuda_block_x_direct, + RAJA::cuda_block_x_unchecked, RAJA::statement::For<0, RAJA::cuda_thread_y_direct, // outer RAJA::statement::For<1, RAJA::cuda_thread_x_direct, // inner RAJA::statement::Lambda<0> @@ -176,11 +176,11 @@ camp::list< RAJA::KernelPolicy< RAJA::statement::CudaKernelFixed<4*8*8, RAJA::statement::Tile<0, RAJA::tile_fixed<4>, - RAJA::cuda_block_z_direct, + RAJA::cuda_block_z_unchecked, RAJA::statement::Tile<1, RAJA::tile_fixed<8>, - RAJA::cuda_block_y_direct, + RAJA::cuda_block_y_unchecked, RAJA::statement::Tile<2, RAJA::tile_fixed<8>, - RAJA::cuda_block_x_direct, + RAJA::cuda_block_x_unchecked, RAJA::statement::For<0, RAJA::cuda_thread_z_direct, // outer RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // middle RAJA::statement::For<2, RAJA::cuda_thread_x_direct, // inner @@ -216,7 +216,7 @@ camp::list< RAJA::statement::Tile<0, RAJA::tile_fixed<8>, RAJA::hip_block_y_direct, RAJA::statement::Tile<1, 
RAJA::tile_fixed<8>, - RAJA::hip_block_x_direct, + RAJA::hip_block_x_unchecked, RAJA::statement::For<0, RAJA::hip_thread_y_direct, // outer RAJA::statement::For<1, RAJA::hip_thread_x_direct, // inner RAJA::statement::Lambda<0> @@ -247,11 +247,11 @@ camp::list< RAJA::KernelPolicy< RAJA::statement::HipKernelFixed<4*8*8, RAJA::statement::Tile<0, RAJA::tile_fixed<4>, - RAJA::hip_block_z_direct, + RAJA::hip_block_z_unchecked, RAJA::statement::Tile<1, RAJA::tile_fixed<8>, - RAJA::hip_block_y_direct, + RAJA::hip_block_y_unchecked, RAJA::statement::Tile<2, RAJA::tile_fixed<8>, - RAJA::hip_block_x_direct, + RAJA::hip_block_x_unchecked, RAJA::statement::For<0, RAJA::hip_thread_z_direct, // outer RAJA::statement::For<1, RAJA::hip_thread_y_direct, // middle RAJA::statement::For<2, RAJA::hip_thread_x_direct, // inner diff --git a/test/functional/kernel/nested-loop/test-kernel-nested-loop.cpp.in b/test/functional/kernel/nested-loop/test-kernel-nested-loop.cpp.in index 5999101f95..d0e3166583 100644 --- a/test/functional/kernel/nested-loop/test-kernel-nested-loop.cpp.in +++ b/test/functional/kernel/nested-loop/test-kernel-nested-loop.cpp.in @@ -73,12 +73,12 @@ using CudaKernelNestedLoopExecPols = camp::list< // Depth 2 Exec Pols NestedLoopData, NestedLoopData, - NestedLoopData, + NestedLoopData, NestedLoopData, RAJA::cuda_global_size_y_loop<8> >, // Depth 3 Exec Pols - NestedLoopData, - NestedLoopData, + NestedLoopData, + NestedLoopData, NestedLoopData, RAJA::cuda_global_size_y_direct<16>, RAJA::seq_exec > >; @@ -91,12 +91,12 @@ using HipKernelNestedLoopExecPols = camp::list< // Depth 2 Exec Pols NestedLoopData, NestedLoopData, - NestedLoopData, + NestedLoopData, NestedLoopData, RAJA::hip_global_size_y_loop<4> >, // Depth 3 Exec Pols - NestedLoopData, - NestedLoopData, + NestedLoopData, + NestedLoopData, NestedLoopData, RAJA::hip_global_size_y_direct<8>, RAJA::seq_exec > >; diff --git a/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt 
b/test/functional/kernel/tile-icount-tcount-direct/CMakeLists.txt similarity index 74% rename from test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt rename to test/functional/kernel/tile-icount-tcount-direct/CMakeLists.txt index b6d02f9da6..864c207419 100644 --- a/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt +++ b/test/functional/kernel/tile-icount-tcount-direct/CMakeLists.txt @@ -29,11 +29,11 @@ endif() foreach( BACKEND ${KERNEL_BACKENDS} ) foreach( TESTTYPE ${TESTTYPES} ) foreach( TILESIZE ${TILESIZES} ) - configure_file( test-kernel-single-loop-tile-count.cpp.in - test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) - raja_add_test( NAME test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) - target_include_directories(test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.exe + configure_file( test-kernel-tile-count-direct.cpp.in + test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-direct.cpp ) + raja_add_test( NAME test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-direct + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-direct.cpp ) + target_include_directories(test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-direct.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() endforeach() diff --git a/test/functional/kernel/single-loop-tile-icount-tcount/test-kernel-single-loop-tile-count.cpp.in b/test/functional/kernel/tile-icount-tcount-direct/test-kernel-tile-count-direct.cpp.in similarity index 93% rename from test/functional/kernel/single-loop-tile-icount-tcount/test-kernel-single-loop-tile-count.cpp.in rename to test/functional/kernel/tile-icount-tcount-direct/test-kernel-tile-count-direct.cpp.in index bd067b32c9..0965dae10a 100644 --- a/test/functional/kernel/single-loop-tile-icount-tcount/test-kernel-single-loop-tile-count.cpp.in +++ 
b/test/functional/kernel/tile-icount-tcount-direct/test-kernel-tile-count-direct.cpp.in @@ -18,7 +18,7 @@ // Header for tests in ./tests directory // // Note: CMake adds ./tests as an include dir for these tests. -#include "test-kernel-single-loop-@TESTTYPE@.hpp" +#include "test-kernel-tile-@TESTTYPE@-direct.hpp" // // Note that a separate test file/executable is generated for each tile size @@ -134,7 +134,7 @@ using CudaKernelForICountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_direct, RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> > @@ -148,7 +148,7 @@ using CudaKernelTileTCountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_loop, + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> > @@ -169,7 +169,7 @@ using HipKernelForICountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_direct, RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0> > @@ -183,7 +183,7 @@ using HipKernelTileTCountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_loop, + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_direct, RAJA::statement::For<0, 
RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0> > @@ -204,7 +204,7 @@ using SyclKernelForICountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::SyclKernel< - RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::sycl_group_0_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::sycl_group_0_direct, RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::sycl_local_0_direct, RAJA::statement::Lambda<0> > @@ -218,7 +218,7 @@ using SyclKernelTileTCountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::SyclKernel< - RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::sycl_group_0_loop, + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::sycl_group_0_direct, RAJA::statement::For<0, RAJA::sycl_local_0_direct, RAJA::statement::Lambda<0> > @@ -232,7 +232,7 @@ using SyclKernelTileTCountExecPols = camp::list< // // Cartesian product of types used in parameterized tests // -using @BACKEND@KernelSingleLoop@TESTTYPE@Types = +using @BACKEND@KernelTile@TESTTYPE@Types = Test< camp::cartesian_product -void KernelSingleLoopForICountTestImpl(IDX_TYPE N, IDX_TYPE tsize) +void KernelTileForICountDirectTestImpl(IDX_TYPE N, IDX_TYPE tsize) { RAJA::ReduceSum trip_count(0); @@ -55,14 +55,14 @@ void KernelSingleLoopForICountTestImpl(IDX_TYPE N, IDX_TYPE tsize) } -TYPED_TEST_SUITE_P(KernelSingleLoopForICountTest); +TYPED_TEST_SUITE_P(KernelTileForICountDirectTest); template -class KernelSingleLoopForICountTest : public ::testing::Test +class KernelTileForICountDirectTest : public ::testing::Test { }; -TYPED_TEST_P(KernelSingleLoopForICountTest, ForICountSingleLoopKernel) +TYPED_TEST_P(KernelTileForICountDirectTest, ForICountTileKernel) { using IDX_TYPE = typename camp::at>::type; using EXEC_POLICY = typename camp::at>::type; @@ -70,14 +70,14 @@ TYPED_TEST_P(KernelSingleLoopForICountTest, ForICountSingleLoopKernel) IDX_TYPE tsize = camp::at_v::value; - 
KernelSingleLoopForICountTestImpl( + KernelTileForICountDirectTestImpl( IDX_TYPE(57), tsize); - KernelSingleLoopForICountTestImpl( + KernelTileForICountDirectTestImpl( IDX_TYPE(1035), tsize); } -REGISTER_TYPED_TEST_SUITE_P(KernelSingleLoopForICountTest, - ForICountSingleLoopKernel); +REGISTER_TYPED_TEST_SUITE_P(KernelTileForICountDirectTest, + ForICountTileKernel); -#endif // __TEST_KERNEL_SINGLE_LOOP_FORICOUNT_HPP__ +#endif // __TEST_KERNEL_TILE_FORICOUNT_DIRECT_HPP__ diff --git a/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-TileTCount.hpp b/test/functional/kernel/tile-icount-tcount-direct/tests/test-kernel-tile-TileTCount-direct.hpp similarity index 73% rename from test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-TileTCount.hpp rename to test/functional/kernel/tile-icount-tcount-direct/tests/test-kernel-tile-TileTCount-direct.hpp index e745a8d08b..6b637fb2d5 100644 --- a/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-TileTCount.hpp +++ b/test/functional/kernel/tile-icount-tcount-direct/tests/test-kernel-tile-TileTCount-direct.hpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef __TEST_KERNEL_SINGLE_LOOP_TILETCOUNT_HPP_ -#define __TEST_KERNEL_SINGLE_LOOP_TILETCOUNT_HPP_ +#ifndef __TEST_KERNEL_TILE_TILETCOUNT_DIRECT_HPP_ +#define __TEST_KERNEL_TILE_TILETCOUNT_DIRECT_HPP_ // // Value struct for manipulating tile sizes in parameterized tests. 
@@ -18,7 +18,7 @@ struct Value { template -void KernelSingleLoopTileTCountTestImpl(IDX_TYPE N, IDX_TYPE tsize) +void KernelTileTileTCountDirectTestImpl(IDX_TYPE N, IDX_TYPE tsize) { IDX_TYPE NT = (N + tsize - 1) / tsize; @@ -57,14 +57,14 @@ void KernelSingleLoopTileTCountTestImpl(IDX_TYPE N, IDX_TYPE tsize) } -TYPED_TEST_SUITE_P(KernelSingleLoopTileTCountTest); +TYPED_TEST_SUITE_P(KernelTileTileTCountDirectTest); template -class KernelSingleLoopTileTCountTest : public ::testing::Test +class KernelTileTileTCountDirectTest : public ::testing::Test { }; -TYPED_TEST_P(KernelSingleLoopTileTCountTest, TileTCountSingleLoopKernel) +TYPED_TEST_P(KernelTileTileTCountDirectTest, TileTCountTileKernel) { using IDX_TYPE = typename camp::at>::type; using EXEC_POLICY = typename camp::at>::type; @@ -72,14 +72,14 @@ TYPED_TEST_P(KernelSingleLoopTileTCountTest, TileTCountSingleLoopKernel) IDX_TYPE tsize = camp::at_v::value; - KernelSingleLoopTileTCountTestImpl( + KernelTileTileTCountDirectTestImpl( IDX_TYPE(57), tsize); - KernelSingleLoopTileTCountTestImpl( + KernelTileTileTCountDirectTestImpl( IDX_TYPE(1035), tsize); } -REGISTER_TYPED_TEST_SUITE_P(KernelSingleLoopTileTCountTest, - TileTCountSingleLoopKernel); +REGISTER_TYPED_TEST_SUITE_P(KernelTileTileTCountDirectTest, + TileTCountTileKernel); -#endif // __TEST_KERNEL_SINGLE_LOOP_TILETCOUNT_HPP_ +#endif // __TEST_KERNEL_TILE_TILETCOUNT_DIRECT_HPP_ diff --git a/test/functional/kernel/tile-icount-tcount-loop/CMakeLists.txt b/test/functional/kernel/tile-icount-tcount-loop/CMakeLists.txt new file mode 100644 index 0000000000..7544ad7d77 --- /dev/null +++ b/test/functional/kernel/tile-icount-tcount-loop/CMakeLists.txt @@ -0,0 +1,36 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# List of test types for future expansion, if needed. +# +set(TESTTYPES ForICount TileTCount) +set(TILESIZES 8 32) + +# +# Generate tests for each enabled RAJA back-end. +# +# Note: KERNEL_BACKENDS is defined in ../CMakeLists.txt +# +foreach( BACKEND ${KERNEL_BACKENDS} ) + # using omp target crashes the compiler with this one + if( NOT ((BACKEND STREQUAL "OpenMPTarget")) ) + foreach( TESTTYPE ${TESTTYPES} ) + foreach( TILESIZE ${TILESIZES} ) + configure_file( test-kernel-tile-count-loop.cpp.in + test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-loop.cpp ) + raja_add_test( NAME test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-loop + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-loop.cpp ) + target_include_directories(test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-loop.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() + endforeach() + endif() +endforeach() + +unset( TILESIZES ) +unset( TESTTYPES ) diff --git a/test/functional/kernel/tile-icount-tcount-loop/test-kernel-tile-count-loop.cpp.in b/test/functional/kernel/tile-icount-tcount-loop/test-kernel-tile-count-loop.cpp.in new file mode 100644 index 0000000000..c1bf5f9489 --- /dev/null +++ b/test/functional/kernel/tile-icount-tcount-loop/test-kernel-tile-count-loop.cpp.in @@ -0,0 +1,246 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-index-types.hpp" + +#include "RAJA_test-forall-data.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +#include "test-kernel-tile-@TESTTYPE@-loop.hpp" + +// +// Note that a separate test file/executable is generated for each tile size +// defined via CMake variable 'TILESIZE' defined in CMakeLists.txt file. +// The reason for doing it this way is that the tests require the tile size +// in the exec policy must match the tile size value defined here. Defining +// multiple tile sizes in a list here and using that to define a cartesian +// product of test cases would break that assumpiton. +// +// Tile size value must match that used in exec policy. +// +using TileSizes = camp::list< Value<@TILESIZE@> >; + +// +// Num reduction policies must match num exec policies. +// +using SequentialReducePols = camp::list< RAJA::seq_reduce >; + +// Sequential execution policy types +using SequentialKernelForICountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::seq_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, + RAJA::statement::Lambda<0> + > + > + > + +>; + +using SequentialKernelTileTCountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::seq_exec, + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::Lambda<0> + > + > + > + +>; + +#if defined(RAJA_ENABLE_OPENMP) +// +// Num reduction policies must match num exec policies. 
+// +using OpenMPReducePols = camp::list< RAJA::omp_reduce >; + +using OpenMPKernelForICountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::seq_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::omp_parallel_for_exec, + RAJA::statement::Lambda<0> + > + > + > + +>; + +using OpenMPKernelTileTCountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::seq_exec, + RAJA::statement::For<0, RAJA::omp_parallel_for_exec, + RAJA::statement::Lambda<0> + > + > + > + +>; +#endif // if defined(RAJA_ENABLE_OPENMP) + +#if defined(RAJA_ENABLE_TARGET_OPENMP) +// +// Num reduction policies must match num exec policies. +// +using OpenMPTargetReducePols = camp::list< RAJA::omp_target_reduce >; + +using OpenMPTargetKernelForICountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::omp_target_parallel_for_exec_nt, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, + RAJA::statement::Lambda<0> + > + > + > + +>; + +using OpenMPTargetKernelTileTCountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::omp_target_parallel_for_exec_nt, + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::Lambda<0> + > + > + > + +>; +#endif // if defined(RAJA_ENABLE_TARGET_OPENMP) + +#if defined(RAJA_ENABLE_CUDA) +// +// Num reduction policies must match num exec policies. 
+// +using CudaReducePols = camp::list< RAJA::cuda_reduce >; + +using CudaKernelForICountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_loop, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_loop, + RAJA::statement::Lambda<0> + > + > + > + > + +>; + +using CudaKernelTileTCountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_loop, + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, + RAJA::statement::Lambda<0> + > + > + > + > + +>; +#endif // if defined(RAJA_ENABLE_CUDA) + +#if defined(RAJA_ENABLE_HIP) +// +// Num reduction policies must match num exec policies. +// +using HipReducePols = camp::list< RAJA::hip_reduce >; + +using HipKernelForICountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_loop, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::hip_thread_x_loop, + RAJA::statement::Lambda<0> + > + > + > + > + +>; + +using HipKernelTileTCountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_loop, + RAJA::statement::For<0, RAJA::hip_thread_x_loop, + RAJA::statement::Lambda<0> + > + > + > + > + +>; +#endif // if defined(RAJA_ENABLE_HIP) + +#if defined(RAJA_ENABLE_SYCL) +// +// Num reduction policies must match num exec policies. 
+// +using SyclReducePols = camp::list< RAJA::sycl_reduce >; + +using SyclKernelForICountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::SyclKernel< + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::sycl_group_0_loop, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::sycl_local_0_loop, + RAJA::statement::Lambda<0> + > + > + > + > + +>; + +using SyclKernelTileTCountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::SyclKernel< + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::sycl_group_0_loop, + RAJA::statement::For<0, RAJA::sycl_local_0_loop, + RAJA::statement::Lambda<0> + > + > + > + > + +>; +#endif // if defined(RAJA_ENABLE_SYCL) + +// +// Cartesian product of types used in parameterized tests +// +using @BACKEND@KernelTile@TESTTYPE@Types = + Test< camp::cartesian_product >::Types; + +// +// Instantiate parameterized tests +// +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + KernelTile@TESTTYPE@LoopTest, + @BACKEND@KernelTile@TESTTYPE@Types); diff --git a/test/functional/kernel/tile-icount-tcount-loop/tests/test-kernel-tile-ForICount-loop.hpp b/test/functional/kernel/tile-icount-tcount-loop/tests/test-kernel-tile-ForICount-loop.hpp new file mode 100644 index 0000000000..2d94ecf729 --- /dev/null +++ b/test/functional/kernel/tile-icount-tcount-loop/tests/test-kernel-tile-ForICount-loop.hpp @@ -0,0 +1,83 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_KERNEL_TILE_FORICOUNT_LOOP_HPP__ +#define __TEST_KERNEL_TILE_FORICOUNT_LOOP_HPP__ + +// +// Value struct for manipulating tile sizes in parameterized tests. 
+// +template +struct Value { + static constexpr int value = VALUE; +}; + + +template +void KernelTileForICountLoopTestImpl(IDX_TYPE N, IDX_TYPE tsize) +{ + + RAJA::ReduceSum trip_count(0); + + for (IDX_TYPE t = 0; t < tsize; ++t) { + + RAJA::ReduceSum tile_count(0); + + RAJA::kernel_param( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple( static_cast(0) ), + + [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE ii) { + trip_count += 1; + if ( i % tsize == t && ii == t ) { + tile_count += 1; + } + } + ); + + IDX_TYPE trip_result = trip_count.get(); + ASSERT_EQ( trip_result, (t+1) * N ); + + IDX_TYPE tile_result = tile_count.get(); + + IDX_TYPE tile_expect = N / tsize; + if ( t < N % tsize ) { + tile_expect += 1; + } + ASSERT_EQ(tile_result, tile_expect); + + } + +} + + +TYPED_TEST_SUITE_P(KernelTileForICountLoopTest); +template +class KernelTileForICountLoopTest : public ::testing::Test +{ +}; + + +TYPED_TEST_P(KernelTileForICountLoopTest, ForICountTileKernel) +{ + using IDX_TYPE = typename camp::at>::type; + using EXEC_POLICY = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + + IDX_TYPE tsize = camp::at_v::value; + + KernelTileForICountLoopTestImpl( + IDX_TYPE(57), tsize); + KernelTileForICountLoopTestImpl( + IDX_TYPE(1035), tsize); + +} + +REGISTER_TYPED_TEST_SUITE_P(KernelTileForICountLoopTest, + ForICountTileKernel); + +#endif // __TEST_KERNEL_TILE_FORICOUNT_LOOP_HPP__ diff --git a/test/functional/kernel/tile-icount-tcount-loop/tests/test-kernel-tile-TileTCount-loop.hpp b/test/functional/kernel/tile-icount-tcount-loop/tests/test-kernel-tile-TileTCount-loop.hpp new file mode 100644 index 0000000000..dd3601238e --- /dev/null +++ b/test/functional/kernel/tile-icount-tcount-loop/tests/test-kernel-tile-TileTCount-loop.hpp @@ -0,0 +1,85 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. 
See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_KERNEL_TILE_TILETCOUNT_LOOP_HPP_ +#define __TEST_KERNEL_TILE_TILETCOUNT_LOOP_HPP_ + +// +// Value struct for manipulating tile sizes in parameterized tests. +// +template +struct Value { + static constexpr int value = VALUE; +}; + + +template +void KernelTileTileTCountLoopTestImpl(IDX_TYPE N, IDX_TYPE tsize) +{ + + IDX_TYPE NT = (N + tsize - 1) / tsize; + + RAJA::ReduceSum trip_count(0); + + for (IDX_TYPE t = 0; t < NT; ++t) { + + RAJA::ReduceSum tile_count(0); + + RAJA::kernel_param( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple( static_cast(0) ), + + [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE ti) { + trip_count += 1; + if ( i / tsize == t && ti == t ) { + tile_count += 1; + } + } + ); + + IDX_TYPE trip_result = trip_count.get(); + ASSERT_EQ( trip_result, (t+1) * N ); + + IDX_TYPE tile_result = tile_count.get(); + + IDX_TYPE tile_expect = tsize; + if ( (t + 1) * tsize > N ) { + tile_expect = N - t * tsize; + } + ASSERT_EQ(tile_result, tile_expect); + + } + +} + + +TYPED_TEST_SUITE_P(KernelTileTileTCountLoopTest); +template +class KernelTileTileTCountLoopTest : public ::testing::Test +{ +}; + + +TYPED_TEST_P(KernelTileTileTCountLoopTest, TileTCountTileKernel) +{ + using IDX_TYPE = typename camp::at>::type; + using EXEC_POLICY = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + + IDX_TYPE tsize = camp::at_v::value; + + KernelTileTileTCountLoopTestImpl( + IDX_TYPE(57), tsize); + KernelTileTileTCountLoopTestImpl( + IDX_TYPE(1035), tsize); + +} + +REGISTER_TYPED_TEST_SUITE_P(KernelTileTileTCountLoopTest, + TileTCountTileKernel); + +#endif // __TEST_KERNEL_TILE_TILETCOUNT_LOOP_HPP_ diff --git a/test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt b/test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt new 
file mode 100644 index 0000000000..be28532b6e --- /dev/null +++ b/test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt @@ -0,0 +1,36 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# List of test types for future expansion, if needed. +# +set(TESTTYPES ForICount TileTCount) +set(TILESIZES 8 32) + +# +# Generate tests for each enabled RAJA back-end. +# +# Note: KERNEL_BACKENDS is defined in ../CMakeLists.txt +# +foreach( BACKEND ${KERNEL_UNCHECKED_BACKENDS} ) + # using omp target crashes the compiler with this one + if( NOT ((BACKEND STREQUAL "OpenMPTarget")) ) + foreach( TESTTYPE ${TESTTYPES} ) + foreach( TILESIZE ${TILESIZES} ) + configure_file( test-kernel-tile-count-unchecked.cpp.in + test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-unchecked.cpp ) + raja_add_test( NAME test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-unchecked + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-unchecked.cpp ) + target_include_directories(test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-unchecked.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() + endforeach() + endif() +endforeach() + +unset( TILESIZES ) +unset( TESTTYPES ) diff --git a/test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in b/test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in new file mode 100644 index 0000000000..fe526aa5a6 --- /dev/null +++ b/test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in @@ -0,0 +1,119 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence 
Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-index-types.hpp" + +#include "RAJA_test-forall-data.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +#include "test-kernel-tile-@TESTTYPE@-unchecked.hpp" + +// +// Note that a separate test file/executable is generated for each tile size +// defined via CMake variable 'TILESIZE' defined in CMakeLists.txt file. +// The reason for doing it this way is that the tests require the tile size +// in the exec policy must match the tile size value defined here. Defining +// multiple tile sizes in a list here and using that to define a cartesian +// product of test cases would break that assumpiton. +// +// Tile size value must match that used in exec policy. +// +using TileSizes = camp::list< Value<@TILESIZE@> >; + +#if defined(RAJA_ENABLE_CUDA) +// +// Num reduction policies must match num exec policies. 
+// +using CudaReducePols = camp::list< RAJA::cuda_reduce >; + +using CudaKernelForICountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_unchecked, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_unchecked, + RAJA::statement::Lambda<0> + > + > + > + > + +>; + +using CudaKernelTileTCountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_unchecked, + RAJA::statement::For<0, RAJA::cuda_thread_x_unchecked, + RAJA::statement::Lambda<0> + > + > + > + > + +>; +#endif // if defined(RAJA_ENABLE_CUDA) + +#if defined(RAJA_ENABLE_HIP) +// +// Num reduction policies must match num exec policies. +// +using HipReducePols = camp::list< RAJA::hip_reduce >; + +using HipKernelForICountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_unchecked, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::hip_thread_x_unchecked, + RAJA::statement::Lambda<0> + > + > + > + > + +>; + +using HipKernelTileTCountExecPols = camp::list< + + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_unchecked, + RAJA::statement::For<0, RAJA::hip_thread_x_unchecked, + RAJA::statement::Lambda<0> + > + > + > + > + +>; +#endif // if defined(RAJA_ENABLE_HIP) + +// +// Cartesian product of types used in parameterized tests +// +using @BACKEND@KernelTile@TESTTYPE@Types = + Test< camp::cartesian_product >::Types; + +// +// Instantiate parameterized tests +// +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + KernelTile@TESTTYPE@UncheckedTest, + @BACKEND@KernelTile@TESTTYPE@Types); diff --git 
a/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp b/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp new file mode 100644 index 0000000000..1a831c3f12 --- /dev/null +++ b/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp @@ -0,0 +1,85 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_KERNEL_TILE_FORICOUNT_UNCHECKED_HPP__ +#define __TEST_KERNEL_TILE_FORICOUNT_UNCHECKED_HPP__ + +// +// Value struct for manipulating tile sizes in parameterized tests. +// +template +struct Value { + static constexpr int value = VALUE; +}; + + +template +void KernelTileForICountUncheckedTestImpl(IDX_TYPE N, IDX_TYPE tsize) +{ + + RAJA::ReduceSum trip_count(0); + + for (IDX_TYPE t = 0; t < tsize; ++t) { + + RAJA::ReduceSum tile_count(0); + + RAJA::kernel_param( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple( static_cast(0) ), + + [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE ii) { + trip_count += 1; + if ( i % tsize == t && ii == t ) { + tile_count += 1; + } + } + ); + + IDX_TYPE trip_result = trip_count.get(); + ASSERT_EQ( trip_result, (t+1) * N ); + + IDX_TYPE tile_result = tile_count.get(); + + IDX_TYPE tile_expect = N / tsize; + if ( t < N % tsize ) { + tile_expect += 1; + } + ASSERT_EQ(tile_result, tile_expect); + + } + +} + + +TYPED_TEST_SUITE_P(KernelTileForICountUncheckedTest); +template +class KernelTileForICountUncheckedTest : public ::testing::Test +{ +}; + + +TYPED_TEST_P(KernelTileForICountUncheckedTest, ForICountTileKernel) +{ + using IDX_TYPE = typename camp::at>::type; + using 
EXEC_POLICY = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + + IDX_TYPE tsize = camp::at_v::value; + + KernelTileForICountUncheckedTestImpl( + IDX_TYPE(0), tsize); + KernelTileForICountUncheckedTestImpl( + IDX_TYPE(tsize), tsize); + KernelTileForICountUncheckedTestImpl( + IDX_TYPE(13*tsize), tsize); + +} + +REGISTER_TYPED_TEST_SUITE_P(KernelTileForICountUncheckedTest, + ForICountTileKernel); + +#endif // __TEST_KERNEL_TILE_FORICOUNT_UNCHECKED_HPP__ diff --git a/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp b/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp new file mode 100644 index 0000000000..ef56efd788 --- /dev/null +++ b/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp @@ -0,0 +1,87 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_KERNEL_TILE_TILETCOUNT_UNCHECKED_HPP_ +#define __TEST_KERNEL_TILE_TILETCOUNT_UNCHECKED_HPP_ + +// +// Value struct for manipulating tile sizes in parameterized tests. 
+// +template +struct Value { + static constexpr int value = VALUE; +}; + + +template +void KernelTileTileTCountUncheckedTestImpl(IDX_TYPE N, IDX_TYPE tsize) +{ + + IDX_TYPE NT = (N + tsize - 1) / tsize; + + RAJA::ReduceSum trip_count(0); + + for (IDX_TYPE t = 0; t < NT; ++t) { + + RAJA::ReduceSum tile_count(0); + + RAJA::kernel_param( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple( static_cast(0) ), + + [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE ti) { + trip_count += 1; + if ( i / tsize == t && ti == t ) { + tile_count += 1; + } + } + ); + + IDX_TYPE trip_result = trip_count.get(); + ASSERT_EQ( trip_result, (t+1) * N ); + + IDX_TYPE tile_result = tile_count.get(); + + IDX_TYPE tile_expect = tsize; + if ( (t + 1) * tsize > N ) { + tile_expect = N - t * tsize; + } + ASSERT_EQ(tile_result, tile_expect); + + } + +} + + +TYPED_TEST_SUITE_P(KernelTileTileTCountUncheckedTest); +template +class KernelTileTileTCountUncheckedTest : public ::testing::Test +{ +}; + + +TYPED_TEST_P(KernelTileTileTCountUncheckedTest, TileTCountTileKernel) +{ + using IDX_TYPE = typename camp::at>::type; + using EXEC_POLICY = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + + IDX_TYPE tsize = camp::at_v::value; + + KernelTileTileTCountUncheckedTestImpl( + IDX_TYPE(0), tsize); + KernelTileTileTCountUncheckedTestImpl( + IDX_TYPE(tsize), tsize); + KernelTileTileTCountUncheckedTestImpl( + IDX_TYPE(13*tsize), tsize); + +} + +REGISTER_TYPED_TEST_SUITE_P(KernelTileTileTCountUncheckedTest, + TileTCountTileKernel); + +#endif // __TEST_KERNEL_TILE_TILETCOUNT_UNCHECKED_HPP_ diff --git a/test/functional/kernel/tile-variants/test-kernel-tiledyn.cpp.in b/test/functional/kernel/tile-variants/test-kernel-tiledyn.cpp.in index 689a218002..401561065c 100644 --- a/test/functional/kernel/tile-variants/test-kernel-tiledyn.cpp.in +++ b/test/functional/kernel/tile-variants/test-kernel-tiledyn.cpp.in @@ -147,7 +147,7 @@ using OpenMPTargetKernelTileExecPols = // 
RAJA::KernelPolicy< // RAJA::statement::CudaKernel< // RAJA::statement::Tile<1, RAJA::tile_dynamic<1>, RAJA::seq_exec, -// RAJA::statement::Tile<0, RAJA::tile_dynamic<0>, RAJA::cuda_block_x_loop, +// RAJA::statement::Tile<0, RAJA::tile_dynamic<0>, RAJA::cuda_block_x_unchecked, // RAJA::statement::For<1, RAJA::seq_exec, // RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // RAJA::statement::Lambda<0, RAJA::Segs<0,1>, RAJA::Params<>> @@ -198,7 +198,7 @@ using OpenMPTargetKernelTileExecPols = // RAJA::KernelPolicy< // RAJA::statement::HipKernel< // RAJA::statement::Tile<1, RAJA::tile_dynamic<1>, RAJA::seq_exec, -// RAJA::statement::Tile<0, RAJA::tile_dynamic<0>, RAJA::hip_block_x_loop, +// RAJA::statement::Tile<0, RAJA::tile_dynamic<0>, RAJA::hip_block_x_unchecked, // RAJA::statement::For<1, RAJA::seq_exec, // RAJA::statement::For<0, RAJA::hip_thread_x_loop, // RAJA::statement::Lambda<0, RAJA::Segs<0,1>, RAJA::Params<>> diff --git a/test/functional/kernel/tile-variants/test-kernel-tilefixed.cpp.in b/test/functional/kernel/tile-variants/test-kernel-tilefixed.cpp.in index 668943051a..9204ab4548 100644 --- a/test/functional/kernel/tile-variants/test-kernel-tilefixed.cpp.in +++ b/test/functional/kernel/tile-variants/test-kernel-tilefixed.cpp.in @@ -147,7 +147,7 @@ using CudaKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_unchecked, RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::cuda_thread_x_loop, RAJA::statement::Lambda<0> @@ -198,7 +198,7 @@ using HipKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_unchecked, 
RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::hip_thread_x_loop, RAJA::statement::Lambda<0> diff --git a/test/functional/kernel/tile-variants/test-kernel-tilelocal.cpp.in b/test/functional/kernel/tile-variants/test-kernel-tilelocal.cpp.in index e4917a997e..b2a1b6a9bd 100644 --- a/test/functional/kernel/tile-variants/test-kernel-tilelocal.cpp.in +++ b/test/functional/kernel/tile-variants/test-kernel-tilelocal.cpp.in @@ -35,7 +35,9 @@ using SequentialKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::InitLocalMem, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, RAJA::statement::Lambda<0> @@ -62,7 +64,9 @@ using OpenMPKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::InitLocalMem, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, RAJA::statement::Lambda<0> @@ -91,9 +95,11 @@ using CudaKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_x_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_y_direct, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_x_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_y_unchecked, + RAJA::statement::InitLocalMem, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_loop, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, RAJA::statement::Lambda<0> @@ -126,9 +132,11 @@ using HipKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::HipKernel< - 
RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_x_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_y_direct, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_x_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_y_unchecked, + RAJA::statement::InitLocalMem, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_loop, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_y_direct, RAJA::statement::Lambda<0> @@ -163,7 +171,9 @@ using HipKernelTileExecPols = RAJA::statement::SyclKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::sycl_group_0_loop, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::sycl_group_1_direct, + RAJA::statement::InitLocalMem, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::sycl_local_0_loop, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::sycl_local_1_direct, RAJA::statement::Lambda<0> diff --git a/test/functional/launch/CMakeLists.txt b/test/functional/launch/CMakeLists.txt index a8fcdfd8ce..25a0bf9379 100644 --- a/test/functional/launch/CMakeLists.txt +++ b/test/functional/launch/CMakeLists.txt @@ -6,6 +6,7 @@ ############################################################################### list(APPEND LAUNCH_BACKENDS Sequential) +set(LAUNCH_UNCHECKED_BACKENDS "") if(RAJA_ENABLE_OPENMP) list(APPEND LAUNCH_BACKENDS OpenMP) @@ -13,10 +14,12 @@ endif() if(RAJA_ENABLE_CUDA) list(APPEND LAUNCH_BACKENDS Cuda) + list(APPEND LAUNCH_UNCHECKED_BACKENDS Cuda) endif() if(RAJA_ENABLE_HIP) list(APPEND LAUNCH_BACKENDS Hip) + list(APPEND LAUNCH_UNCHECKED_BACKENDS Hip) endif() if(RAJA_ENABLE_SYCL) @@ -36,16 +39,24 @@ add_subdirectory(segment) add_subdirectory(shared_mem) -add_subdirectory(nested_loop) +add_subdirectory(nested_unchecked) add_subdirectory(nested_direct) -add_subdirectory(tile_icount_direct) +add_subdirectory(nested_loop) + +add_subdirectory(tile_icount_tcount_unchecked) -add_subdirectory(tile_icount_loop) 
+add_subdirectory(tile_icount_tcount_direct) + +add_subdirectory(tile_icount_tcount_loop) + +add_subdirectory(nested_tile_unchecked) add_subdirectory(nested_tile_direct) add_subdirectory(nested_tile_loop) unset( LAUNCH_BACKENDS ) +unset( LAUNCH_UNCHECKED_BACKENDS ) + diff --git a/test/functional/launch/nested_tile_unchecked/CMakeLists.txt b/test/functional/launch/nested_tile_unchecked/CMakeLists.txt new file mode 100644 index 0000000000..6152dfa2ad --- /dev/null +++ b/test/functional/launch/nested_tile_unchecked/CMakeLists.txt @@ -0,0 +1,22 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# Generate tests for each enabled RAJA back-end. +# +# + +foreach( BACKEND ${LAUNCH_UNCHECKED_BACKENDS} ) + configure_file( test-launch-nested-tile-unchecked.cpp.in + test-launch-nested-Tile-Unchecked-${BACKEND}.cpp ) + raja_add_test( NAME test-launch-nested-Tile-Unchecked-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-Unchecked-${BACKEND}.cpp ) + + target_include_directories(test-launch-nested-Tile-Unchecked-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +endforeach() + diff --git a/test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in b/test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in new file mode 100644 index 0000000000..85595ac970 --- /dev/null +++ b/test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in @@ -0,0 +1,39 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. 
See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-index-types.hpp" + +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +// +#include "test-launch-nested-Tile-Unchecked.hpp" + + +// +// Cartesian product of types used in parameterized tests +// +using @BACKEND@LaunchNestedTypes = + Test< camp::cartesian_product>::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + LaunchNestedTileUncheckedTest, + @BACKEND@LaunchNestedTypes); diff --git a/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp b/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp new file mode 100644 index 0000000000..4702473b5e --- /dev/null +++ b/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp @@ -0,0 +1,145 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_LAUNCH_NESTED_TILE_UNCHECKED_HPP__ +#define __TEST_LAUNCH_NESTED_TILE_UNCHECKED_HPP__ + +#include + +template +void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) +{ + + const int tile_size_x = 2; + const int tile_size_y = 3; + const int tile_size_z = 4; + + const int threads_x = tile_size_x; + const int threads_y = tile_size_y; + const int threads_z = tile_size_z; + + const int blocks_x = 4*M; + const int blocks_y = 5*M; + const int blocks_z = 6*M; + + RAJA::TypedRangeSegment r1(0, tile_size_x*blocks_x); + RAJA::TypedRangeSegment r2(0, tile_size_y*blocks_y); + RAJA::TypedRangeSegment r3(0, tile_size_z*blocks_z); + + INDEX_TYPE N1 = static_cast(r1.end() - r1.begin()); + INDEX_TYPE N2 = static_cast(r2.end() - r2.begin()); + INDEX_TYPE N3 = static_cast(r3.end() - r3.begin()); + + INDEX_TYPE N = static_cast(N1 * + N2 * + N3); + + camp::resources::Resource working_res{WORKING_RES::get_default()}; + INDEX_TYPE* working_array; + INDEX_TYPE* check_array; + INDEX_TYPE* test_array; + + size_t data_len = RAJA::stripIndexType(N); + + allocateForallTestData(data_len, + working_res, + &working_array, + &check_array, + &test_array); + + std::iota(test_array, test_array + RAJA::stripIndexType(N), 0); + + constexpr int DIM = 3; + using layout_t = RAJA::Layout; + RAJA::View Aview(working_array, N3, N2, N1); + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(threads_x, threads_y,threads_z)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, tile_size_z, r3, [&](RAJA::TypedRangeSegment const &z_tile) { + RAJA::tile(ctx, tile_size_y, r2, [&](RAJA::TypedRangeSegment const &y_tile) { + RAJA::tile(ctx, tile_size_x, r1, [&](RAJA::TypedRangeSegment const &x_tile) { + + RAJA::loop(ctx, z_tile, [&](INDEX_TYPE tz) { + RAJA::loop(ctx, y_tile, [&](INDEX_TYPE ty) { + RAJA::loop(ctx, 
x_tile, [&](INDEX_TYPE tx) { + + auto idx = tx + N1 * (ty + N2 * tz); + + Aview(tz, ty, tx) = static_cast(idx); + + }); + }); + }); + + }); + }); + }); + }); + + if ( RAJA::stripIndexType(N) > 0 ) { + + working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len); + + } + + for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) { + ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]); + } + + deallocateForallTestData(working_res, + working_array, + check_array, + test_array); +} + + +TYPED_TEST_SUITE_P(LaunchNestedTileUncheckedTest); +template +class LaunchNestedTileUncheckedTest : public ::testing::Test +{ +}; + + +TYPED_TEST_P(LaunchNestedTileUncheckedTest, RangeSegmentTeams) +{ + + using INDEX_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using LAUNCH_POLICY = typename camp::at>::type, camp::num<0>>::type; + + using THREAD_X_POLICY = typename camp::at>::type, camp::num<1>>::type; + using THREAD_Y_POLICY = typename camp::at>::type, camp::num<2>>::type; + using THREAD_Z_POLICY = typename camp::at>::type, camp::num<3>>::type; + + using TEAM_X_POLICY = typename camp::at>::type, camp::num<4>>::type; + using TEAM_Y_POLICY = typename camp::at>::type, camp::num<5>>::type; + using TEAM_Z_POLICY = typename camp::at>::type, camp::num<6>>::type; + + + // test zero-length range segment + LaunchNestedTileUncheckedTestImpl + (INDEX_TYPE(0)); + + //Keep at one since we are doing a unchecked thread test + LaunchNestedTileUncheckedTestImpl + (INDEX_TYPE(1)); + + +} + +REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileUncheckedTest, + RangeSegmentTeams); + +#endif // __TEST_LAUNCH_NESTED_TILE_UNCHECKED_HPP__ diff --git a/test/functional/launch/nested_unchecked/CMakeLists.txt b/test/functional/launch/nested_unchecked/CMakeLists.txt new file mode 100644 index 0000000000..cb67616db9 --- /dev/null +++ b/test/functional/launch/nested_unchecked/CMakeLists.txt @@ -0,0 +1,30 @@ 
+############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# List of segment types for generating test files. +# +set(NESTEDTYPES Unchecked) + +# +# Generate tests for each enabled RAJA back-end. +# +# + +foreach( BACKEND ${LAUNCH_UNCHECKED_BACKENDS} ) + foreach( NESTEDTYPES ${NESTEDTYPES} ) + configure_file( test-launch-nested.cpp.in + test-launch-nested-${NESTEDTYPES}-${BACKEND}.cpp ) + raja_add_test( NAME test-launch-nested-${NESTEDTYPES}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-${NESTEDTYPES}-${BACKEND}.cpp ) + + target_include_directories(test-launch-nested-${NESTEDTYPES}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() +endforeach() + +unset( NESTEDTYPES ) diff --git a/test/functional/launch/nested_unchecked/test-launch-nested.cpp.in b/test/functional/launch/nested_unchecked/test-launch-nested.cpp.in new file mode 100644 index 0000000000..08ec672089 --- /dev/null +++ b/test/functional/launch/nested_unchecked/test-launch-nested.cpp.in @@ -0,0 +1,39 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-index-types.hpp" + +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +// +#include "test-launch-nested-@NESTEDTYPES@.hpp" + + +// +// Cartesian product of types used in parameterized tests +// +using @BACKEND@LaunchNestedTypes = + Test< camp::cartesian_product>::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + LaunchNested@NESTEDTYPES@Test, + @BACKEND@LaunchNestedTypes); diff --git a/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp b/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp new file mode 100644 index 0000000000..f3b05b31c7 --- /dev/null +++ b/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp @@ -0,0 +1,150 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_LAUNCH_NESTED_UNCHECKED_HPP__ +#define __TEST_LAUNCH_NESTED_UNCHECKED_HPP__ + +#include + +template +void LaunchNestedUncheckedTestImpl(INDEX_TYPE M) +{ + + RAJA::TypedRangeSegment r1(0, 2*M); + RAJA::TypedRangeSegment r2(0, 3*M); + RAJA::TypedRangeSegment r3(0, 4*M); + + RAJA::TypedRangeSegment r4(0, 4*M); + RAJA::TypedRangeSegment r5(0, 5*M); + RAJA::TypedRangeSegment r6(0, 6*M); + + INDEX_TYPE N1 = static_cast(r1.end() - r1.begin()); + INDEX_TYPE N2 = static_cast(r2.end() - r2.begin()); + INDEX_TYPE N3 = static_cast(r3.end() - r3.begin()); + + INDEX_TYPE N4 = static_cast(r4.end() - r4.begin()); + INDEX_TYPE N5 = static_cast(r5.end() - r5.begin()); + INDEX_TYPE N6 = static_cast(r6.end() - r6.begin()); + + INDEX_TYPE N = static_cast(N1 * N2 * + N3 * N4 * + N5 * N6); + + camp::resources::Resource working_res{WORKING_RES::get_default()}; + INDEX_TYPE* working_array; + INDEX_TYPE* check_array; + INDEX_TYPE* test_array; + + size_t data_len = RAJA::stripIndexType(N); + + allocateForallTestData(data_len, + working_res, + &working_array, + &check_array, + &test_array); + //6 threads total + const int threads_x = 2*M; + const int threads_y = 3*M; + const int threads_z = 4*M; + + const int blocks_x = 4*M; + const int blocks_y = 5*M; + const int blocks_z = 6*M; + + std::iota(test_array, test_array + RAJA::stripIndexType(N), 0); + + const int DIM = 6; + using layout_t = RAJA::Layout; + RAJA::View Aview(working_array, N6, N5, N4, N3, N2, N1); + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(threads_x, threads_y,threads_z)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, r6, [&](INDEX_TYPE bz) { + RAJA::loop(ctx, r5, [&](INDEX_TYPE by) { + RAJA::loop(ctx, r4, [&](INDEX_TYPE bx) { + + RAJA::loop(ctx, r3, [&](INDEX_TYPE tz) { + RAJA::loop(ctx, r2, [&](INDEX_TYPE ty) 
{ + RAJA::loop(ctx, r1, [&](INDEX_TYPE tx) { + + auto idx = tx + N1 * (ty + N2 * (tz + N3 * (bx + N4 * (by + N5 * bz)))); + + + Aview(bz, by, bx, tz, ty, tx) = static_cast(idx); + }); + }); + }); + + }); + }); + }); + }); + + if ( RAJA::stripIndexType(N) > 0 ) { + + working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len); + + } + + for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) { + ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]); + } + + deallocateForallTestData(working_res, + working_array, + check_array, + test_array); +} + + +TYPED_TEST_SUITE_P(LaunchNestedUncheckedTest); +template +class LaunchNestedUncheckedTest : public ::testing::Test +{ +}; + + +TYPED_TEST_P(LaunchNestedUncheckedTest, RangeSegmentTeams) +{ + + using INDEX_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using LAUNCH_POLICY = typename camp::at>::type, camp::num<0>>::type; + + using TEAM_Z_POLICY = typename camp::at>::type, camp::num<1>>::type; + using TEAM_Y_POLICY = typename camp::at>::type, camp::num<2>>::type; + using TEAM_X_POLICY = typename camp::at>::type, camp::num<3>>::type; + + using THREAD_Z_POLICY = typename camp::at>::type, camp::num<4>>::type; + using THREAD_Y_POLICY = typename camp::at>::type, camp::num<5>>::type; + using THREAD_X_POLICY = typename camp::at>::type, camp::num<6>>::type; + + + + // test zero-length range segment + LaunchNestedUncheckedTestImpl + (INDEX_TYPE(0)); + + //Keep at one since we are doing a unchecked thread test + LaunchNestedUncheckedTestImpl + (INDEX_TYPE(1)); + + +} + +REGISTER_TYPED_TEST_SUITE_P(LaunchNestedUncheckedTest, + RangeSegmentTeams); + +#endif // __TEST_LAUNCH_NESTED_UNCHECKED_HPP__ diff --git a/test/functional/launch/tile_icount_loop/CMakeLists.txt b/test/functional/launch/tile_icount_tcount_direct/CMakeLists.txt similarity index 60% rename from test/functional/launch/tile_icount_loop/CMakeLists.txt rename to 
test/functional/launch/tile_icount_tcount_direct/CMakeLists.txt index 0074324cf6..dbe06d5bb3 100644 --- a/test/functional/launch/tile_icount_loop/CMakeLists.txt +++ b/test/functional/launch/tile_icount_tcount_direct/CMakeLists.txt @@ -11,12 +11,12 @@ # foreach( BACKEND ${LAUNCH_BACKENDS} ) - configure_file( test-launch-nested-tile-icount-loop.cpp.in - test-launch-nested-Tile-iCount-Loop-${BACKEND}.cpp ) - raja_add_test( NAME test-launch-nested-Tile-iCount-Loop-${BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-iCount-Loop-${BACKEND}.cpp ) + configure_file( test-launch-nested-tile-icount-tcount-direct.cpp.in + test-launch-nested-Tile-iCount-tCount-Direct-${BACKEND}.cpp ) + raja_add_test( NAME test-launch-nested-Tile-iCount-tCount-Direct-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-iCount-tCount-Direct-${BACKEND}.cpp ) - target_include_directories(test-launch-nested-Tile-iCount-Loop-${BACKEND}.exe + target_include_directories(test-launch-nested-Tile-iCount-tCount-Direct-${BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() diff --git a/test/functional/launch/tile_icount_direct/test-launch-nested-tile-icount-direct.cpp.in b/test/functional/launch/tile_icount_tcount_direct/test-launch-nested-tile-icount-tcount-direct.cpp.in similarity index 95% rename from test/functional/launch/tile_icount_direct/test-launch-nested-tile-icount-direct.cpp.in rename to test/functional/launch/tile_icount_tcount_direct/test-launch-nested-tile-icount-tcount-direct.cpp.in index 575f7166a7..9216c44d5f 100644 --- a/test/functional/launch/tile_icount_direct/test-launch-nested-tile-icount-direct.cpp.in +++ b/test/functional/launch/tile_icount_tcount_direct/test-launch-nested-tile-icount-tcount-direct.cpp.in @@ -20,7 +20,7 @@ // // Note: CMake adds ./tests as an include dir for these tests. 
// -#include "test-launch-nested-Tile-iCount-Direct.hpp" +#include "test-launch-nested-Tile-iCount-tCount-Direct.hpp" // diff --git a/test/functional/launch/tile_icount_direct/tests/test-launch-nested-Tile-iCount-Direct.hpp b/test/functional/launch/tile_icount_tcount_direct/tests/test-launch-nested-Tile-iCount-tCount-Direct.hpp similarity index 96% rename from test/functional/launch/tile_icount_direct/tests/test-launch-nested-Tile-iCount-Direct.hpp rename to test/functional/launch/tile_icount_tcount_direct/tests/test-launch-nested-Tile-iCount-tCount-Direct.hpp index 72d59d290a..0d0397bf5b 100644 --- a/test/functional/launch/tile_icount_direct/tests/test-launch-nested-Tile-iCount-Direct.hpp +++ b/test/functional/launch/tile_icount_tcount_direct/tests/test-launch-nested-Tile-iCount-tCount-Direct.hpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef __TEST_LAUNCH_NESTED_TILE_ICOUNT_DIRECT_HPP__ -#define __TEST_LAUNCH_NESTED_TILE_ICOUNT_DIRECT_HPP__ +#ifndef __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_DIRECT_HPP__ +#define __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_DIRECT_HPP__ #include @@ -175,4 +175,4 @@ TYPED_TEST_P(LaunchNestedTileDirectTest, RangeSegmentTeams) REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileDirectTest, RangeSegmentTeams); -#endif // __TEST_LAUNCH_NESTED_TILE_DIRECT_HPP__ +#endif // __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_DIRECT_HPP__ diff --git a/test/functional/launch/tile_icount_direct/CMakeLists.txt b/test/functional/launch/tile_icount_tcount_loop/CMakeLists.txt similarity index 61% rename from test/functional/launch/tile_icount_direct/CMakeLists.txt rename to test/functional/launch/tile_icount_tcount_loop/CMakeLists.txt index 6270f99f1a..c454c2ffc8 100644 --- a/test/functional/launch/tile_icount_direct/CMakeLists.txt +++ b/test/functional/launch/tile_icount_tcount_loop/CMakeLists.txt @@ -11,12 +11,12 @@ # foreach( BACKEND ${LAUNCH_BACKENDS} ) - 
configure_file( test-launch-nested-tile-icount-direct.cpp.in - test-launch-nested-Tile-iCount-Direct-${BACKEND}.cpp ) - raja_add_test( NAME test-launch-nested-Tile-iCount-Direct-${BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-iCount-Direct-${BACKEND}.cpp ) + configure_file( test-launch-nested-tile-icount-tcount-loop.cpp.in + test-launch-nested-Tile-iCount-tCount-Loop-${BACKEND}.cpp ) + raja_add_test( NAME test-launch-nested-Tile-iCount-tCount-Loop-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-iCount-tCount-Loop-${BACKEND}.cpp ) - target_include_directories(test-launch-nested-Tile-iCount-Direct-${BACKEND}.exe + target_include_directories(test-launch-nested-Tile-iCount-tCount-Loop-${BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() diff --git a/test/functional/launch/tile_icount_loop/test-launch-nested-tile-icount-loop.cpp.in b/test/functional/launch/tile_icount_tcount_loop/test-launch-nested-tile-icount-tcount-loop.cpp.in similarity index 95% rename from test/functional/launch/tile_icount_loop/test-launch-nested-tile-icount-loop.cpp.in rename to test/functional/launch/tile_icount_tcount_loop/test-launch-nested-tile-icount-tcount-loop.cpp.in index a29adaeaa3..790a586dbc 100644 --- a/test/functional/launch/tile_icount_loop/test-launch-nested-tile-icount-loop.cpp.in +++ b/test/functional/launch/tile_icount_tcount_loop/test-launch-nested-tile-icount-tcount-loop.cpp.in @@ -20,7 +20,7 @@ // // Note: CMake adds ./tests as an include dir for these tests. 
// -#include "test-launch-nested-Tile-iCount-Loop.hpp" +#include "test-launch-nested-Tile-iCount-tCount-Loop.hpp" // diff --git a/test/functional/launch/tile_icount_loop/tests/test-launch-nested-Tile-iCount-Loop.hpp b/test/functional/launch/tile_icount_tcount_loop/tests/test-launch-nested-Tile-iCount-tCount-Loop.hpp similarity index 97% rename from test/functional/launch/tile_icount_loop/tests/test-launch-nested-Tile-iCount-Loop.hpp rename to test/functional/launch/tile_icount_tcount_loop/tests/test-launch-nested-Tile-iCount-tCount-Loop.hpp index 31adc84810..f15ea41035 100644 --- a/test/functional/launch/tile_icount_loop/tests/test-launch-nested-Tile-iCount-Loop.hpp +++ b/test/functional/launch/tile_icount_tcount_loop/tests/test-launch-nested-Tile-iCount-tCount-Loop.hpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef __TEST_LAUNCH_NESTED_TILE_ICOUNT_LOOP_hpp__ -#define __TEST_LAUNCH_NESTED_TILE_ICOUNT_LOOP_hpp__ +#ifndef __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_LOOP_hpp__ +#define __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_LOOP_hpp__ #include @@ -179,4 +179,4 @@ TYPED_TEST_P(LaunchNestedTileLoopTest, RangeSegmentTeams) REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileLoopTest, RangeSegmentTeams); -#endif // __TEST_LAUNCH_NESTED_TILE_DIRECT_HPP__ +#endif // __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_LOOP_hpp__ diff --git a/test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt b/test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt new file mode 100644 index 0000000000..cb01f0b926 --- /dev/null +++ b/test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt @@ -0,0 +1,22 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# Generate tests for each enabled RAJA back-end. +# +# + +foreach( BACKEND ${LAUNCH_UNCHECKED_BACKENDS} ) + configure_file( test-launch-nested-tile-icount-tcount-unchecked.cpp.in + test-launch-nested-Tile-iCount-tCount-Unchecked-${BACKEND}.cpp ) + raja_add_test( NAME test-launch-nested-Tile-iCount-tCount-Unchecked-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-iCount-tCount-Unchecked-${BACKEND}.cpp ) + + target_include_directories(test-launch-nested-Tile-iCount-tCount-Unchecked-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +endforeach() + diff --git a/test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in b/test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in new file mode 100644 index 0000000000..3ddaeff554 --- /dev/null +++ b/test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in @@ -0,0 +1,39 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-index-types.hpp" + +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. 
+// +#include "test-launch-nested-Tile-iCount-tCount-Unchecked.hpp" + + +// +// Cartesian product of types used in parameterized tests +// +using @BACKEND@LaunchNestedTypes = + Test< camp::cartesian_product>::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + LaunchNestedTileUncheckedTest, + @BACKEND@LaunchNestedTypes); diff --git a/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp b/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp new file mode 100644 index 0000000000..49990c228d --- /dev/null +++ b/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp @@ -0,0 +1,138 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_UNCHECKED_HPP__ +#define __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_UNCHECKED_HPP__ + +#include + +template +void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) +{ + + constexpr int threads_x = 4; + const int blocks_x = M*4; + + RAJA::TypedRangeSegment r1(0, threads_x*blocks_x); + + INDEX_TYPE N = static_cast(r1.end() - r1.begin()); + + camp::resources::Resource working_res{WORKING_RES::get_default()}; + INDEX_TYPE* working_ttile_array; + INDEX_TYPE* check_ttile_array; + INDEX_TYPE* test_ttile_array; + + INDEX_TYPE* working_iloop_array; + INDEX_TYPE* check_iloop_array; + INDEX_TYPE* test_iloop_array; + + size_t data_len = RAJA::stripIndexType(N); + + allocateForallTestData(data_len, + working_res, + &working_ttile_array, + &check_ttile_array, + &test_ttile_array); + + allocateForallTestData(data_len, + working_res, + &working_iloop_array, + &check_iloop_array, + &test_iloop_array); + + + std::iota(test_ttile_array, test_ttile_array + RAJA::stripIndexType(N), 0); + std::iota(test_iloop_array, test_iloop_array + RAJA::stripIndexType(N), 0); + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(threads_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::tile_tcount( + ctx, threads_x, r1, [&](RAJA::TypedRangeSegment const &x_tile, INDEX_TYPE bx) { + RAJA::loop_icount( + ctx, x_tile, [&](INDEX_TYPE tx, INDEX_TYPE ix) { + + working_ttile_array[tx] = bx; + working_iloop_array[tx] = ix; + + } + ); + } + ); + } + ); + + if ( RAJA::stripIndexType(N) > 0 ) { + + working_res.memcpy(check_ttile_array, working_ttile_array, sizeof(INDEX_TYPE) * data_len); + working_res.memcpy(check_iloop_array, working_iloop_array, sizeof(INDEX_TYPE) * data_len); + + } + + INDEX_TYPE idx = 0; + for (INDEX_TYPE bx = INDEX_TYPE(0); bx < blocks_x; ++bx) { + for (INDEX_TYPE tx = INDEX_TYPE(0); 
tx < threads_x; ++tx) { + + ASSERT_EQ(check_ttile_array[RAJA::stripIndexType(idx)], bx); + ASSERT_EQ(check_iloop_array[RAJA::stripIndexType(idx)], tx); + + idx++; + } + } + + deallocateForallTestData(working_res, + working_ttile_array, + check_ttile_array, + test_ttile_array); + + deallocateForallTestData(working_res, + working_iloop_array, + check_iloop_array, + test_iloop_array); +} + + +TYPED_TEST_SUITE_P(LaunchNestedTileUncheckedTest); +template +class LaunchNestedTileUncheckedTest : public ::testing::Test +{ +}; + + +TYPED_TEST_P(LaunchNestedTileUncheckedTest, RangeSegmentTeams) +{ + + using INDEX_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using LAUNCH_POLICY = typename camp::at>::type, camp::num<0>>::type; + + using TEAM_X_POLICY = typename camp::at>::type, camp::num<1>>::type; + using THREAD_X_POLICY = typename camp::at>::type, camp::num<2>>::type; + + + // test zero-length range segment + LaunchNestedTileUncheckedTestImpl + (INDEX_TYPE(0)); + + //Keep at one since we are doing a unchecked thread test + LaunchNestedTileUncheckedTestImpl + (INDEX_TYPE(1)); + + LaunchNestedTileUncheckedTestImpl + (INDEX_TYPE(2)); + + +} + +REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileUncheckedTest, + RangeSegmentTeams); + +#endif // __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_UNCHECKED_HPP__ diff --git a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp index 7179e48fdc..5b756e00bf 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp @@ -45,7 +45,7 @@ using OpenMP_launch_policies = camp::list; using cuda_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, RAJA::LoopPolicy >; @@ -68,7 +68,7 @@ using Cuda_launch_policies = using hip_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, 
RAJA::LoopPolicy >; diff --git a/test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp new file mode 100644 index 0000000000..da7eac0553 --- /dev/null +++ b/test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp @@ -0,0 +1,56 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// Execution policy lists used throughout launch tests +// + +#ifndef __RAJA_TEST_LAUNCH_UNCHECKED_TEAMS_THREADS_1D_EXECPOL_HPP__ +#define __RAJA_TEST_LAUNCH_UNCHECKED_TEAMS_THREADS_1D_EXECPOL_HPP__ + +#include "RAJA/RAJA.hpp" +#include "camp/list.hpp" + + +#if defined(RAJA_ENABLE_CUDA) + +using cuda_unchecked_policies = + camp::list< + RAJA::LaunchPolicy>, + RAJA::LoopPolicy, + RAJA::LoopPolicy + >; + +using cuda_unchecked_explicit_policies = + camp::list< + RAJA::LaunchPolicy>, + RAJA::LoopPolicy, + RAJA::LoopPolicy + >; + +using Cuda_launch_policies = + camp::list< + cuda_unchecked_policies, + cuda_unchecked_explicit_policies + >; +#endif // RAJA_ENABLE_CUDA + +#if defined(RAJA_ENABLE_HIP) + +using hip_unchecked_policies = + camp::list< + RAJA::LaunchPolicy>, + RAJA::LoopPolicy, + RAJA::LoopPolicy + >; + +using Hip_launch_policies = camp::list; + +#endif // RAJA_ENABLE_HIP + + +#endif // __RAJA_TEST_LAUNCH_UNCHECKED_TEAMS_THREADS_1D_EXECPOL_HPP__ diff --git a/test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp new file mode 100644 index 0000000000..59e16ab3e8 --- /dev/null +++ b/test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp @@ -0,0 +1,68 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// Execution policy lists used throughout launch tests +// + +#ifndef __RAJA_TEST_LAUNCH_UNCHECKED_TEAM_THREADS_3D_EXECPOL_HPP__ +#define __RAJA_TEST_LAUNCH_UNCHECKED_TEAM_THREADS_3D_EXECPOL_HPP__ + +#include "RAJA/RAJA.hpp" +#include "camp/list.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +using cuda_unchecked_policies = + camp::list< + RAJA::LaunchPolicy>, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy + >; + +using cuda_unchecked_explicit_policies = + camp::list< + RAJA::LaunchPolicy>, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy + >; + +using Cuda_launch_policies = + camp::list< + cuda_unchecked_policies, + cuda_unchecked_explicit_policies + >; + +#endif // RAJA_ENABLE_CUDA + +#if defined(RAJA_ENABLE_HIP) + +using hip_unchecked_policies = + camp::list< + RAJA::LaunchPolicy>, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy + >; + +using Hip_launch_policies = camp::list; + +#endif // RAJA_ENABLE_HIP + + +#endif //__RAJA_TEST_LAUNCH_UNCHECKED_TEAM_THREADS_3D_EXECPOL_HPP__ From e7a727364687b600ce4f73a925357ae55d517026 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 12 Sep 2024 11:52:25 -0700 Subject: [PATCH 12/15] Fix launch nested Tile tests --- .../tests/test-launch-nested-Tile-Direct.hpp | 45 ++++++------- .../tests/test-launch-nested-Tile-Loop.hpp | 67 +++++++++---------- .../test-launch-nested-Tile-Unchecked.hpp | 25 ++++--- 3 files changed, 69 insertions(+), 68 deletions(-) diff --git 
a/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp b/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp index 793d432987..e73897025f 100644 --- a/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp +++ b/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp @@ -20,17 +20,18 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M) constexpr int tile_size_y = 3; constexpr int tile_size_z = 4; - constexpr int threads_x = 2*tile_size_x; - constexpr int threads_y = 3*tile_size_y; - constexpr int threads_z = 4*tile_size_z; + constexpr int threads_x = tile_size_x; + constexpr int threads_y = tile_size_y; + constexpr int threads_z = tile_size_z; constexpr int blocks_x = 4; constexpr int blocks_y = 5; constexpr int blocks_z = 6; - RAJA::TypedRangeSegment r1(0, tile_size_x*M); - RAJA::TypedRangeSegment r2(0, tile_size_y*M); - RAJA::TypedRangeSegment r3(0, tile_size_z*M); + // Use fewer than the number of teams and threads + RAJA::TypedRangeSegment r1(0, ((blocks_x-1)*threads_x+1)*M); + RAJA::TypedRangeSegment r2(0, ((blocks_y-1)*threads_y+1)*M); + RAJA::TypedRangeSegment r3(0, ((blocks_z-1)*threads_z+1)*M); INDEX_TYPE N1 = static_cast(r1.end() - r1.begin()); INDEX_TYPE N2 = static_cast(r2.end() - r2.begin()); @@ -56,9 +57,10 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M) &check_array, &test_array); - if ( RAJA::stripIndexType(N) > 0 ) { + std::iota(test_array, test_array + data_len, 0); + working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len); - std::iota(test_array, test_array + RAJA::stripIndexType(N), 0); + if ( RAJA::stripIndexType(N) > 0 ) { constexpr int DIM = 3; using layout_t = RAJA::Layout; @@ -78,7 +80,7 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M) auto idx = tx + N1 * (ty + N2 * tz); - Aview(tz, ty, tx) = static_cast(idx); + Aview(tz, ty, tx) += static_cast(idx); }); }); @@ -90,17 +92,13 @@ void 
LaunchNestedTileDirectTestImpl(INDEX_TYPE M) }); } else { // zero-length segment - memset(static_cast(test_array), 0, sizeof(INDEX_TYPE) * data_len); - - working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len); - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(blocks_x, blocks_y ,blocks_z)), + (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(threads_x, threads_y,threads_z)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::tile(ctx, threads_z, r3, [&](RAJA::TypedRangeSegment const &z_tile) { - RAJA::tile(ctx, threads_y, r2, [&](RAJA::TypedRangeSegment const &y_tile) { - RAJA::tile(ctx, threads_x, r1, [&](RAJA::TypedRangeSegment const &x_tile) { + RAJA::tile(ctx, tile_size_z, r3, [&](RAJA::TypedRangeSegment const &z_tile) { + RAJA::tile(ctx, tile_size_y, r2, [&](RAJA::TypedRangeSegment const &y_tile) { + RAJA::tile(ctx, tile_size_x, r1, [&](RAJA::TypedRangeSegment const &x_tile) { RAJA::loop(ctx, z_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(tz)) { RAJA::loop(ctx, y_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(ty)) { @@ -119,6 +117,7 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M) } working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len); + working_res.wait(); if (RAJA::stripIndexType(N) > 0) { @@ -153,13 +152,13 @@ TYPED_TEST_P(LaunchNestedTileDirectTest, RangeSegmentTeams) using WORKING_RES = typename camp::at>::type; using LAUNCH_POLICY = typename camp::at>::type, camp::num<0>>::type; - using THREAD_X_POLICY = typename camp::at>::type, camp::num<1>>::type; - using THREAD_Y_POLICY = typename camp::at>::type, camp::num<2>>::type; - using THREAD_Z_POLICY = typename camp::at>::type, camp::num<3>>::type; + using TEAM_Z_POLICY = typename camp::at>::type, camp::num<1>>::type; + using TEAM_Y_POLICY = typename camp::at>::type, camp::num<2>>::type; + using TEAM_X_POLICY = typename camp::at>::type, camp::num<3>>::type; - using TEAM_X_POLICY = typename 
camp::at>::type, camp::num<4>>::type; - using TEAM_Y_POLICY = typename camp::at>::type, camp::num<5>>::type; - using TEAM_Z_POLICY = typename camp::at>::type, camp::num<6>>::type; + using THREAD_Z_POLICY = typename camp::at>::type, camp::num<4>>::type; + using THREAD_Y_POLICY = typename camp::at>::type, camp::num<5>>::type; + using THREAD_X_POLICY = typename camp::at>::type, camp::num<6>>::type; // test zero-length range segment diff --git a/test/functional/launch/nested_tile_loop/tests/test-launch-nested-Tile-Loop.hpp b/test/functional/launch/nested_tile_loop/tests/test-launch-nested-Tile-Loop.hpp index 07deab0376..1f565143fd 100644 --- a/test/functional/launch/nested_tile_loop/tests/test-launch-nested-Tile-Loop.hpp +++ b/test/functional/launch/nested_tile_loop/tests/test-launch-nested-Tile-Loop.hpp @@ -16,18 +16,22 @@ template r1(0, threads_x*M + 1); - RAJA::TypedRangeSegment r2(0, threads_y*M + 1); - RAJA::TypedRangeSegment r3(0, threads_z*M + 1); + // Use more than the number of teams and threads + RAJA::TypedRangeSegment r1(0, (2*blocks_x*threads_x+1)*M); + RAJA::TypedRangeSegment r2(0, (2*blocks_y*threads_y+1)*M); + RAJA::TypedRangeSegment r3(0, (2*blocks_z*threads_z+1)*M); INDEX_TYPE N1 = static_cast(r1.end() - r1.begin()); INDEX_TYPE N2 = static_cast(r2.end() - r2.begin()); @@ -53,9 +57,10 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M) &check_array, &test_array); - if ( RAJA::stripIndexType(N) > 0 ) { + std::iota(test_array, test_array + data_len, 0); + working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len); - std::iota(test_array, test_array + RAJA::stripIndexType(N), 0); + if ( RAJA::stripIndexType(N) > 0 ) { constexpr int DIM = 3; using layout_t = RAJA::Layout; @@ -65,9 +70,9 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M) (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(threads_x, threads_y,threads_z)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::tile(ctx, threads_z, r3, 
[&](RAJA::TypedRangeSegment const &z_tile) { - RAJA::tile(ctx, threads_y, r2, [&](RAJA::TypedRangeSegment const &y_tile) { - RAJA::tile(ctx, threads_x, r1, [&](RAJA::TypedRangeSegment const &x_tile) { + RAJA::tile(ctx, tile_size_z, r3, [&](RAJA::TypedRangeSegment const &z_tile) { + RAJA::tile(ctx, tile_size_y, r2, [&](RAJA::TypedRangeSegment const &y_tile) { + RAJA::tile(ctx, tile_size_x, r1, [&](RAJA::TypedRangeSegment const &x_tile) { RAJA::loop(ctx, z_tile, [&](INDEX_TYPE tz) { RAJA::loop(ctx, y_tile, [&](INDEX_TYPE ty) { @@ -75,7 +80,7 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M) auto idx = tx + N1 * (ty + N2 * tz); - Aview(tz, ty, tx) = static_cast(idx); + Aview(tz, ty, tx) += static_cast(idx); }); }); }); @@ -86,27 +91,20 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M) }); } else { // zero-length segment - memset(static_cast(test_array), 0, sizeof(INDEX_TYPE) * data_len); - - working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len); - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(blocks_x, blocks_y ,blocks_z)), + (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(threads_x, threads_y,threads_z)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::tile(ctx, threads_z, r3, [&](RAJA::TypedRangeSegment const &z_tile) { - RAJA::tile(ctx, threads_y, r2, [&](RAJA::TypedRangeSegment const &y_tile) { - RAJA::tile(ctx, threads_x, r1, [&](RAJA::TypedRangeSegment const &x_tile) { + RAJA::tile(ctx, tile_size_z, r3, [&](RAJA::TypedRangeSegment const &z_tile) { + RAJA::tile(ctx, tile_size_y, r2, [&](RAJA::TypedRangeSegment const &y_tile) { + RAJA::tile(ctx, tile_size_x, r1, [&](RAJA::TypedRangeSegment const &x_tile) { - RAJA::loop(ctx, z_tile, [&](INDEX_TYPE tz) { - RAJA::loop(ctx, y_tile, [&](INDEX_TYPE ty) { - RAJA::loop(ctx, x_tile, [&](INDEX_TYPE tx) { - - (void) tx; - (void) ty; - (void) tz; + RAJA::loop(ctx, z_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(tz)) { + 
RAJA::loop(ctx, y_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(ty)) { + RAJA::loop(ctx, x_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(tx)) { working_array[0]++; + }); }); }); @@ -118,6 +116,7 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M) } working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len); + working_res.wait(); if (RAJA::stripIndexType(N) > 0) { @@ -152,13 +151,13 @@ TYPED_TEST_P(LaunchNestedTileLoopTest, RangeSegmentTeams) using WORKING_RES = typename camp::at>::type; using LAUNCH_POLICY = typename camp::at>::type, camp::num<0>>::type; - using THREAD_X_POLICY = typename camp::at>::type, camp::num<1>>::type; - using THREAD_Y_POLICY = typename camp::at>::type, camp::num<2>>::type; - using THREAD_Z_POLICY = typename camp::at>::type, camp::num<3>>::type; + using TEAM_Z_POLICY = typename camp::at>::type, camp::num<1>>::type; + using TEAM_Y_POLICY = typename camp::at>::type, camp::num<2>>::type; + using TEAM_X_POLICY = typename camp::at>::type, camp::num<3>>::type; - using TEAM_X_POLICY = typename camp::at>::type, camp::num<4>>::type; - using TEAM_Y_POLICY = typename camp::at>::type, camp::num<5>>::type; - using TEAM_Z_POLICY = typename camp::at>::type, camp::num<6>>::type; + using THREAD_Z_POLICY = typename camp::at>::type, camp::num<4>>::type; + using THREAD_Y_POLICY = typename camp::at>::type, camp::num<5>>::type; + using THREAD_X_POLICY = typename camp::at>::type, camp::num<6>>::type; // test zero-length range segment diff --git a/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp b/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp index 4702473b5e..1c48c0c7a0 100644 --- a/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp +++ b/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp @@ -28,9 +28,10 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) const int blocks_y = 5*M; const int blocks_z = 
6*M; - RAJA::TypedRangeSegment r1(0, tile_size_x*blocks_x); - RAJA::TypedRangeSegment r2(0, tile_size_y*blocks_y); - RAJA::TypedRangeSegment r3(0, tile_size_z*blocks_z); + // Use exactly the number of teams and threads + RAJA::TypedRangeSegment r1(0, threads_x*blocks_x); + RAJA::TypedRangeSegment r2(0, threads_y*blocks_y); + RAJA::TypedRangeSegment r3(0, threads_z*blocks_z); INDEX_TYPE N1 = static_cast(r1.end() - r1.begin()); INDEX_TYPE N2 = static_cast(r2.end() - r2.begin()); @@ -53,7 +54,8 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) &check_array, &test_array); - std::iota(test_array, test_array + RAJA::stripIndexType(N), 0); + std::iota(test_array, test_array + data_len, 0); + working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len); constexpr int DIM = 3; using layout_t = RAJA::Layout; @@ -73,7 +75,7 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) auto idx = tx + N1 * (ty + N2 * tz); - Aview(tz, ty, tx) = static_cast(idx); + Aview(tz, ty, tx) += static_cast(idx); }); }); @@ -89,6 +91,7 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len); } + working_res.wait(); for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) { ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]); @@ -115,13 +118,13 @@ TYPED_TEST_P(LaunchNestedTileUncheckedTest, RangeSegmentTeams) using WORKING_RES = typename camp::at>::type; using LAUNCH_POLICY = typename camp::at>::type, camp::num<0>>::type; - using THREAD_X_POLICY = typename camp::at>::type, camp::num<1>>::type; - using THREAD_Y_POLICY = typename camp::at>::type, camp::num<2>>::type; - using THREAD_Z_POLICY = typename camp::at>::type, camp::num<3>>::type; + using TEAM_Z_POLICY = typename camp::at>::type, camp::num<1>>::type; + using TEAM_Y_POLICY = typename camp::at>::type, camp::num<2>>::type; + using TEAM_X_POLICY = typename camp::at>::type, camp::num<3>>::type; - using TEAM_X_POLICY = typename 
camp::at>::type, camp::num<4>>::type; - using TEAM_Y_POLICY = typename camp::at>::type, camp::num<5>>::type; - using TEAM_Z_POLICY = typename camp::at>::type, camp::num<6>>::type; + using THREAD_Z_POLICY = typename camp::at>::type, camp::num<4>>::type; + using THREAD_Y_POLICY = typename camp::at>::type, camp::num<5>>::type; + using THREAD_X_POLICY = typename camp::at>::type, camp::num<6>>::type; // test zero-length range segment From c4b466b8b4b432e24be98a61f3b7928436e4302c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 12 Sep 2024 17:36:19 -0700 Subject: [PATCH 13/15] simplify launch testing and add waits --- .../tests/test-launch-nested-Direct.hpp | 13 ++++++------- .../tests/test-launch-nested-Loop.hpp | 12 +++++------- .../tests/test-launch-nested-Tile-Unchecked.hpp | 8 ++++---- .../tests/test-launch-nested-Unchecked.hpp | 15 +++++++++------ ...-launch-nested-Tile-iCount-tCount-Direct.hpp | 17 ++++++++--------- ...st-launch-nested-Tile-iCount-tCount-Loop.hpp | 13 ++++++------- ...unch-nested-Tile-iCount-tCount-Unchecked.hpp | 14 ++++++++------ 7 files changed, 46 insertions(+), 46 deletions(-) diff --git a/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp b/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp index a730d030a7..3766dc403d 100644 --- a/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp +++ b/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp @@ -51,6 +51,10 @@ void LaunchNestedDirectTestImpl(INDEX_TYPE M) &working_array, &check_array, &test_array); + + std::iota(test_array, test_array + data_len, 0); + working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len); + //6 threads total constexpr int threads_x = 2; constexpr int threads_y = 3; @@ -62,8 +66,6 @@ void LaunchNestedDirectTestImpl(INDEX_TYPE M) if ( RAJA::stripIndexType(N) > 0 ) { - std::iota(test_array, test_array + RAJA::stripIndexType(N), 0); - constexpr int DIM = 6; using layout_t = 
RAJA::Layout; RAJA::View Aview(working_array, N6, N5, N4, N3, N2, N1); @@ -83,7 +85,7 @@ void LaunchNestedDirectTestImpl(INDEX_TYPE M) auto idx = tx + N1 * (ty + N2 * (tz + N3 * (bx + N4 * (by + N5 * bz)))); - Aview(bz, by, bx, tz, ty, tx) = static_cast(idx); + Aview(bz, by, bx, tz, ty, tx) += static_cast(idx); }); }); }); @@ -94,10 +96,6 @@ void LaunchNestedDirectTestImpl(INDEX_TYPE M) }); } else { // zero-length segment - memset(static_cast(test_array), 0, sizeof(INDEX_TYPE) * data_len); - - working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len); - RAJA::launch (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(blocks_x, blocks_y ,blocks_z)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -123,6 +121,7 @@ void LaunchNestedDirectTestImpl(INDEX_TYPE M) } working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len); + working_res.wait(); if (RAJA::stripIndexType(N) > 0) { diff --git a/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp b/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp index 8f3b9702d0..b9758b69fd 100644 --- a/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp +++ b/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp @@ -56,6 +56,9 @@ void LaunchNestedLoopTestImpl(INDEX_TYPE M) &check_array, &test_array); + std::iota(test_array, test_array + data_len, 0); + working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len); + //6 threads total constexpr int threads_x = 1; constexpr int threads_y = 2; @@ -67,8 +70,6 @@ void LaunchNestedLoopTestImpl(INDEX_TYPE M) if ( RAJA::stripIndexType(N) > 0 ) { - std::iota(test_array, test_array + RAJA::stripIndexType(N), 0); - constexpr int DIM = 6; using layout_t = RAJA::Layout; RAJA::View Aview(working_array, N6, N5, N4, N3, N2, N1); @@ -88,7 +89,7 @@ void LaunchNestedLoopTestImpl(INDEX_TYPE M) auto idx = tx + N1 * (ty + N2 * (tz + N3 * (bx + N4 * (by + N5 * bz)))); - 
Aview(bz, by, bx, tz, ty, tx) = static_cast(idx); + Aview(bz, by, bx, tz, ty, tx) += static_cast(idx); }); }); @@ -100,10 +101,6 @@ void LaunchNestedLoopTestImpl(INDEX_TYPE M) }); } else { // zero-length segment - memset(static_cast(test_array), 0, sizeof(INDEX_TYPE) * data_len); - - working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len); - RAJA::launch (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(blocks_x, blocks_y ,blocks_z)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -129,6 +126,7 @@ void LaunchNestedLoopTestImpl(INDEX_TYPE M) } working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len); + working_res.wait(); if (RAJA::stripIndexType(N) > 0) { diff --git a/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp b/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp index 1c48c0c7a0..c50a5fb267 100644 --- a/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp +++ b/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp @@ -55,7 +55,9 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) &test_array); std::iota(test_array, test_array + data_len, 0); - working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len); + if ( data_len > 0 ) { + working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len); + } constexpr int DIM = 3; using layout_t = RAJA::Layout; @@ -86,10 +88,8 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) }); }); - if ( RAJA::stripIndexType(N) > 0 ) { - + if ( data_len > 0 ) { working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len); - } working_res.wait(); diff --git a/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp b/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp index f3b05b31c7..ed2a096c39 100644 --- 
a/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp +++ b/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp @@ -48,6 +48,12 @@ void LaunchNestedUncheckedTestImpl(INDEX_TYPE M) &working_array, &check_array, &test_array); + + std::iota(test_array, test_array + data_len, 0); + if ( data_len > 0 ) { + working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len); + } + //6 threads total const int threads_x = 2*M; const int threads_y = 3*M; @@ -57,8 +63,6 @@ void LaunchNestedUncheckedTestImpl(INDEX_TYPE M) const int blocks_y = 5*M; const int blocks_z = 6*M; - std::iota(test_array, test_array + RAJA::stripIndexType(N), 0); - const int DIM = 6; using layout_t = RAJA::Layout; RAJA::View Aview(working_array, N6, N5, N4, N3, N2, N1); @@ -78,7 +82,7 @@ void LaunchNestedUncheckedTestImpl(INDEX_TYPE M) auto idx = tx + N1 * (ty + N2 * (tz + N3 * (bx + N4 * (by + N5 * bz)))); - Aview(bz, by, bx, tz, ty, tx) = static_cast(idx); + Aview(bz, by, bx, tz, ty, tx) += static_cast(idx); }); }); }); @@ -88,11 +92,10 @@ void LaunchNestedUncheckedTestImpl(INDEX_TYPE M) }); }); - if ( RAJA::stripIndexType(N) > 0 ) { - + if ( data_len > 0 ) { working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len); - } + working_res.wait(); for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) { ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]); diff --git a/test/functional/launch/tile_icount_tcount_direct/tests/test-launch-nested-Tile-iCount-tCount-Direct.hpp b/test/functional/launch/tile_icount_tcount_direct/tests/test-launch-nested-Tile-iCount-tCount-Direct.hpp index 0d0397bf5b..15b39a2fb2 100644 --- a/test/functional/launch/tile_icount_tcount_direct/tests/test-launch-nested-Tile-iCount-tCount-Direct.hpp +++ b/test/functional/launch/tile_icount_tcount_direct/tests/test-launch-nested-Tile-iCount-tCount-Direct.hpp @@ -52,10 +52,12 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M) 
&check_iloop_array, &test_iloop_array); - if ( RAJA::stripIndexType(N) > 0 ) { + std::iota(test_ttile_array, test_ttile_array + data_len, 0); + std::iota(test_iloop_array, test_iloop_array + data_len, 0); + working_res.memset(working_ttile_array, 0, sizeof(INDEX_TYPE) * data_len); + working_res.memset(working_iloop_array, 0, sizeof(INDEX_TYPE) * data_len); - std::iota(test_ttile_array, test_ttile_array + RAJA::stripIndexType(N), 0); - std::iota(test_iloop_array, test_iloop_array + RAJA::stripIndexType(N), 0); + if ( RAJA::stripIndexType(N) > 0 ) { RAJA::launch( RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(threads_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -65,8 +67,8 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M) RAJA::loop_icount( ctx, x_tile, [&](INDEX_TYPE tx, INDEX_TYPE ix) { - working_ttile_array[tx] = bx; - working_iloop_array[tx] = ix; + working_ttile_array[tx] += bx; + working_iloop_array[tx] += ix; } ); @@ -77,10 +79,6 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M) } else { // zero-length segment - memset(static_cast(test_ttile_array), 0, sizeof(INDEX_TYPE) * data_len); - - working_res.memcpy(working_ttile_array, test_ttile_array, sizeof(INDEX_TYPE) * data_len); - RAJA::launch( RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(blocks_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -102,6 +100,7 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M) working_res.memcpy(check_ttile_array, working_ttile_array, sizeof(INDEX_TYPE) * data_len); working_res.memcpy(check_iloop_array, working_iloop_array, sizeof(INDEX_TYPE) * data_len); + working_res.wait(); if (RAJA::stripIndexType(N) > 0) { diff --git a/test/functional/launch/tile_icount_tcount_loop/tests/test-launch-nested-Tile-iCount-tCount-Loop.hpp b/test/functional/launch/tile_icount_tcount_loop/tests/test-launch-nested-Tile-iCount-tCount-Loop.hpp index f15ea41035..482ceabd6e 100644 --- 
a/test/functional/launch/tile_icount_tcount_loop/tests/test-launch-nested-Tile-iCount-tCount-Loop.hpp +++ b/test/functional/launch/tile_icount_tcount_loop/tests/test-launch-nested-Tile-iCount-tCount-Loop.hpp @@ -55,10 +55,12 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M) &check_iloop_array, &test_iloop_array); - if ( RAJA::stripIndexType(N) > 0 ) { + std::iota(test_ttile_array, test_ttile_array + data_len, 0); + std::iota(test_iloop_array, test_iloop_array + data_len, 0); + working_res.memset(working_ttile_array, 0, sizeof(INDEX_TYPE) * data_len); + working_res.memset(working_iloop_array, 0, sizeof(INDEX_TYPE) * data_len); - std::iota(test_ttile_array, test_ttile_array + RAJA::stripIndexType(N), 0); - std::iota(test_iloop_array, test_iloop_array + RAJA::stripIndexType(N), 0); + if ( RAJA::stripIndexType(N) > 0 ) { RAJA::launch( RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(threads_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -79,10 +81,6 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M) ); } else { // zero-length segment - memset(static_cast(test_ttile_array), 0, sizeof(INDEX_TYPE) * data_len); - - working_res.memcpy(working_ttile_array, test_ttile_array, sizeof(INDEX_TYPE) * data_len); - RAJA::launch( RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(blocks_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -105,6 +103,7 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M) working_res.memcpy(check_ttile_array, working_ttile_array, sizeof(INDEX_TYPE) * data_len); working_res.memcpy(check_iloop_array, working_iloop_array, sizeof(INDEX_TYPE) * data_len); + working_res.wait(); if (RAJA::stripIndexType(N) > 0) { diff --git a/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp b/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp index 49990c228d..d9b19a83a7 100644 --- 
a/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp +++ b/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp @@ -45,9 +45,12 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) &check_iloop_array, &test_iloop_array); - - std::iota(test_ttile_array, test_ttile_array + RAJA::stripIndexType(N), 0); - std::iota(test_iloop_array, test_iloop_array + RAJA::stripIndexType(N), 0); + if ( data_len > 0 ) { + std::iota(test_ttile_array, test_ttile_array + data_len, 0); + std::iota(test_iloop_array, test_iloop_array + data_len, 0); + working_res.memset(working_ttile_array, 0, sizeof(INDEX_TYPE) * data_len); + working_res.memset(working_iloop_array, 0, sizeof(INDEX_TYPE) * data_len); + } RAJA::launch( RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(threads_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -67,12 +70,11 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) } ); - if ( RAJA::stripIndexType(N) > 0 ) { - + if ( data_len > 0 ) { working_res.memcpy(check_ttile_array, working_ttile_array, sizeof(INDEX_TYPE) * data_len); working_res.memcpy(check_iloop_array, working_iloop_array, sizeof(INDEX_TYPE) * data_len); - } + working_res.wait(); INDEX_TYPE idx = 0; for (INDEX_TYPE bx = INDEX_TYPE(0); bx < blocks_x; ++bx) { From 8680495f1d97d3b0029f3c2f6a0e60924677b8ec Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 30 Dec 2024 13:54:38 -0800 Subject: [PATCH 14/15] Rename from unchecked to direct_unchecked --- docs/sphinx/user_guide/feature/policies.rst | 79 +++++++++++-------- include/RAJA/policy/cuda/kernel/For.hpp | 4 +- include/RAJA/policy/cuda/kernel/ForICount.hpp | 8 +- include/RAJA/policy/cuda/kernel/Tile.hpp | 8 +- .../RAJA/policy/cuda/kernel/TileTCount.hpp | 6 +- include/RAJA/policy/cuda/kernel/internal.hpp | 36 ++++----- include/RAJA/policy/cuda/launch.hpp | 32 ++++---- include/RAJA/policy/cuda/policy.hpp | 36 ++++----- 
include/RAJA/policy/hip/kernel/For.hpp | 4 +- include/RAJA/policy/hip/kernel/ForICount.hpp | 8 +- include/RAJA/policy/hip/kernel/Tile.hpp | 8 +- include/RAJA/policy/hip/kernel/TileTCount.hpp | 6 +- include/RAJA/policy/hip/kernel/internal.hpp | 38 ++++----- include/RAJA/policy/hip/launch.hpp | 32 ++++---- include/RAJA/policy/hip/policy.hpp | 36 ++++----- include/RAJA/util/types.hpp | 9 ++- test/functional/kernel/CMakeLists.txt | 8 +- .../test-kernel-nested-loop.cpp.in | 4 +- .../test-kernel-nested-loop-segments.cpp.in | 4 +- .../test-kernel-nested-loop-view.cpp.in | 16 ++-- .../test-kernel-nested-loop.cpp.in | 8 +- .../CMakeLists.txt | 12 +-- .../test-kernel-tile-count-unchecked.cpp.in | 20 ++--- .../test-kernel-tile-ForICount-unchecked.hpp | 22 +++--- .../test-kernel-tile-TileTCount-unchecked.hpp | 22 +++--- .../tile-variants/test-kernel-tiledyn.cpp.in | 4 +- .../test-kernel-tilefixed.cpp.in | 4 +- .../test-kernel-tilelocal.cpp.in | 8 +- test/functional/launch/CMakeLists.txt | 14 ++-- .../nested_tile_unchecked/CMakeLists.txt | 12 +-- .../test-launch-nested-tile-unchecked.cpp.in | 6 +- .../test-launch-nested-Tile-Unchecked.hpp | 22 +++--- .../launch/nested_unchecked/CMakeLists.txt | 4 +- .../test-launch-nested.cpp.in | 2 +- .../tests/test-launch-nested-Unchecked.hpp | 22 +++--- .../CMakeLists.txt | 12 +-- ...nested-tile-icount-tcount-unchecked.cpp.in | 6 +- ...ch-nested-Tile-iCount-tCount-Unchecked.hpp | 24 +++--- ...launch-direct-teams-threads-1D-execpol.hpp | 4 +- ...nch-unchecked-teams-threads-1D-execpol.hpp | 30 +++---- ...nch-unchecked-teams-threads-3D-execpol.hpp | 54 ++++++------- 41 files changed, 352 insertions(+), 342 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 8ba998f012..cbb43e4774 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -309,15 +309,16 @@ policies have the prefix ``hip_``. 
| | | expression is executed | | | | on the device. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_thread_x_unchecked | kernel (For) | Map loop iterates unchecked to | -| | launch (loop) | GPU threads in x-dimension, one | +| cuda/hip_thread_x_direct_unchecked | kernel (For) | Map loop iterates directly | +| | launch (loop) | without checking loop bounds to | +| | | GPU threads in x-dimension, one | | | | iterate per thread. See note | | | | below about limitations. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_thread_y_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_thread_y_direct_unchecked | kernel (For) | Same as above, but map | | | launch (loop) | to threads in y-dimension. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_thread_z_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_thread_z_direct_unchecked | kernel (For) | Same as above, but map | | | launch (loop) | to threads in z-dimension. | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_thread_x_direct | kernel (For) | Map loop iterates directly to | @@ -346,15 +347,16 @@ policies have the prefix ``hip_``. | | launch (loop) | policy, but safe to use | | | | with Cuda/HipSyncThreads. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_thread_size_x_unchecked | kernel (For) | Same as thread_x_unchecked | -| | launch (loop) | policy above but with | +| cuda/hip_thread_size_x_direct_unchecked| kernel (For) | Same as | +| | launch (loop) | thread_x_direct_unchecked | +| | | policy above but with | | | | a compile time number of | | | | threads. 
| +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_thread_size_y_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_thread_size_y_direct_unchecked| kernel (For) | Same as above, but map | | | launch (loop) | to threads in y-dimension | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_thread_size_z_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_thread_size_z_direct_unchecked| kernel (For) | Same as above, but map | | | launch (loop) | to threads in z-dimension. | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_thread_size_x_direct | kernel (For) | Same as thread_x_direct | @@ -368,7 +370,7 @@ policies have the prefix ``hip_``. | cuda/hip_thread_size_z_direct | kernel (For) | Same as above, but map | | | launch (loop) | to threads in z-dimension. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_flatten_threads_{xyz}_unchecked | launch (loop) | Reshapes threads in a | +| cuda/hip_flatten_threads_{xyz}_direct_unchecked | launch (loop) | Reshapes threads in a | | | | multi-dimensional thread | | | | team into one-dimension. | | | | Accepts any permutation | @@ -381,15 +383,16 @@ policies have the prefix ``hip_``. | cuda/hip_flatten_threads_{xyz}_loop | launch (loop) | Same as above, but with loop | | | | mapping. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_block_x_unchecked | kernel (For) | Map loop iterates unchecked | -| | launch (loop) | to GPU thread blocks in the | +| cuda/hip_block_x_direct_unchecked | kernel (For) | Map loop iterates directly | +| | launch (loop) | without checking loop bounds | +| | | to GPU thread blocks in the | | | | x-dimension, one iterate per | | | | block. 
| +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_block_y_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_block_y_direct_unchecked | kernel (For) | Same as above, but map | | | launch (loop) | to blocks in y-dimension | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_block_z_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_block_z_direct_unchecked | kernel (For) | Same as above, but map | | | launch (loop) | to blocks in z-dimension | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_block_x_direct | kernel (For) | Map loop iterates directly to | @@ -413,14 +416,15 @@ policies have the prefix ``hip_``. | cuda/hip_block_z_loop | kernel (For) | Same as above, but use | | | launch (loop) | blocks in z-dimension | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_block_size_x_unchecked | kernel (For) | Same as block_x_unchecked | +| cuda/hip_block_size_x_direct_unchecked | kernel (For) | Same as | +| | | block_x_direct_unchecked | | | launch (loop) | policy above but with a | | | | compile time number of blocks | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_block_size_y_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_block_size_y_direct_unchecked | kernel (For) | Same as above, but map | | | launch (loop) | to blocks in y-dim | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_block_size_z_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_block_size_z_direct_unchecked | kernel (For) | Same as above, but map | | | launch (loop) | to blocks in z-dim | 
+----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | @@ -443,8 +447,9 @@ policies have the prefix ``hip_``. | cuda/hip_block_size_z_loop | kernel (For) | Same as above, but map | | | launch (loop) | to blocks in z-dim | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_global_x_unchecked | kernel (For) | Map loop iterates unchecked | -| | launch (loop) | to GPU threads in the grid in | +| cuda/hip_global_x_direct_unchecked | kernel (For) | Map loop iterates directly | +| | launch (loop) | without checking loop bounds | +| | | to GPU threads in the grid in | | | | the x-dimension, one iterate | | | | per thread. Creates a unique | | | | thread id for each thread on | @@ -453,13 +458,14 @@ policies have the prefix ``hip_``. | | | threadIdx.x + | | | | threadDim.x * blockIdx.x. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_global_y_unchecked | kernel (For) | Same as above, but uses | +| cuda/hip_global_y_direct_unchecked | kernel (For) | Same as above, but uses | | | launch (loop) | globals in y-dimension. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_global_z_unchecked | kernel (For) | Same as above, but uses | +| cuda/hip_global_z_direct_unchecked | kernel (For) | Same as above, but uses | | | launch (loop) | globals in z-dimension. 
| +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_global_x_direct | kernel (For) | Same as global_x_unchecked | +| cuda/hip_global_x_direct | kernel (For) | Same as | +| | | global_x_direct_unchecked | | | launch (loop) | above, but maps loop iterates | | | launch (loop) | directly to GPU threads in the | | | | grid, one or no iterates per | @@ -482,15 +488,16 @@ policies have the prefix ``hip_``. | cuda/hip_global_z_loop | kernel (For) | Same as above, but use | | | launch (loop) | globals in z-dimension | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_global_size_x_unchecked | kernel (For) | Same as global_x_unchecked | +| cuda/hip_global_size_x_direct_unchecked| kernel (For) | Same as | +| | | global_x_direct_unchecked | | | launch (loop) | policy above but with | | | | a compile time block | | | | size. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_global_size_y_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_global_size_y_direct_unchecked| kernel (For) | Same as above, but map | | | launch (loop) | to globals in y-dim | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_global_size_z_unchecked | kernel (For) | Same as above, but map | +| cuda/hip_global_size_z_direct_unchecked| kernel (For) | Same as above, but map | | | launch (loop) | to globals in z-dim | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_global_size_x_direct | kernel (For) | Same as global_x_direct | @@ -515,8 +522,9 @@ policies have the prefix ``hip_``. 
| cuda/hip_global_size_z_loop | kernel (For) | Same as above, but map | | | launch (loop) | to globals in z-dim | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_warp_unchecked | kernel (For) | Map work to threads in a | -| | | warp unchecked. | +| cuda/hip_warp_direct_unchecked | kernel (For) | Map work to threads in a | +| | | warp directly without checking | +| | | loop bounds. | | | | Cannot be used in conjunction | | | | with cuda/hip_thread_x_* | | | | policies. | @@ -525,7 +533,8 @@ policies have the prefix ``hip_``. | | | cuda/hip_thread_y/z_* | | | | policies. | +----------------------------------------------------+---------------+---------------------------------+ -| cuda/hip_warp_direct | kernel (For) | Similar to warp_unchecked, but | +| cuda/hip_warp_direct | kernel (For) | Similar to | +| | | warp_direct_unchecked, but | | | | map work to threads | | | | in a warp directly. | +----------------------------------------------------+---------------+---------------------------------+ @@ -589,22 +598,22 @@ policies: | | BLOCKS_PER_SM_OFFSET) * sm_per_device | +----------------------------------------------------+-----------------------------------------+ -Several notable constraints apply to RAJA CUDA/HIP *unchecked* policies. +Several notable constraints apply to RAJA CUDA/HIP *direct_unchecked* policies. -.. note:: * Unchecked policies do not mask out threads that are out-of-range. +.. note:: * DirectUnchecked policies do not mask out threads that are out-of-range. So they should only be used when the size of the range matches the size of the block or grid. - * Repeating unchecked policies with the same dimension in perfectly + * Repeating direct_unchecked policies with the same dimension in perfectly nested loops is not recommended. Your code may do something, but likely will not do what you expect and/or be correct. 
- * If multiple unchecked policies are used in a kernel (using different + * If multiple direct_unchecked policies are used in a kernel (using different dimensions), the product of sizes of the corresponding iteration spaces cannot be greater than the maximum allowable threads per block or blocks per grid. Typically, this is 1024 threads per block. Attempting to execute a kernel with more than the maximum allowed causes the CUDA/HIP runtime to complain about *illegal launch parameters.* - * **Block-unchecked policies are recommended for most tiled loop + * **Block-direct-unchecked policies are recommended for most tiled loop patterns. In these cases the CUDA/HIP kernel is launched with the exact number of blocks needed so no checking is necessary.** @@ -648,9 +657,9 @@ Several notes regarding CUDA/HIP *loop* policies are also good to know. Finally -.. note:: CUDA/HIP block-unchecked or block-direct policies may be preferable +.. note:: CUDA/HIP block-direct-unchecked or block-direct policies may be preferable to block-loop policies in situations where block load balancing may - be an issue as the block-unchecked or block-direct policies may yield + be an issue as the block-direct-unchecked or block-direct policies may yield better performance. Several notes regarding the CUDA/HIP policy implementation that allow you to @@ -661,7 +670,7 @@ write more explicit policies. behavior of the policy. * Policies have a mapping from loop iterations to iterates in the index set via a iteration_mapping enum template parameter. The - possible values are Unchecked, Direct, and StridedLoop. + possible values are DirectUnchecked, Direct, and StridedLoop. * Policies can be safely used with some synchronization constructs via a kernel_sync_requirement enum template parameter. The possible values are none and sync. 
diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp index abefacd9e5..7a6d10f4ec 100644 --- a/include/RAJA/policy/cuda/kernel/For.hpp +++ b/include/RAJA/policy/cuda/kernel/For.hpp @@ -45,7 +45,7 @@ template , + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index 85b82a9cc6..92a59cb9a8 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -32,7 +32,7 @@ namespace internal /* * Executor for work sharing inside CudaKernel. - * Provides an unchecked mapping. + * Provides a direct unchecked mapping. 
* Assigns the loop index to offset ArgumentId * Assigns the loop index to param ParamId * Meets all sync requirements @@ -47,20 +47,20 @@ template , + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> : CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp index 865e476da0..a5377f7d7d 100644 --- a/include/RAJA/policy/cuda/kernel/Tile.hpp +++ b/include/RAJA/policy/cuda/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -116,8 +116,8 @@ struct CudaStatementExecutor< // restrict to first tile segment = segment.slice(0, static_cast(chunk_size)); - // NOTE: We do not detect improper uses of unchecked policies under tiling. - // This happens when using an unchecked policy on a tiled range that is not + // NOTE: We do not detect improper uses of direct_unchecked policies under tiling. + // This happens when using a direct unchecked policy on a tiled range that is not // evenly divisible by chunk_size. 
LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(private_data); diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp index 513b2fded4..a2de5e2bf3 100644 --- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp +++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index 09be99c506..6e3ddcdde8 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -215,9 +215,9 @@ using cuda_statement_list_executor_t = CudaStatementListExecutor< template struct KernelDimensionCalculator; -// specialization for unchecked sequential policies +// specialization for direct unchecked sequential policies template -struct KernelDimensionCalculator>> { @@ -232,9 +232,9 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -262,16 +262,16 @@ struct KernelDimensionCalculator(IndexMapper::block_size) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); set_cuda_dim(min_dims.threads, static_cast(IndexMapper::block_size)); } }; -// specialization for unchecked block 
policies +// specialization for direct unchecked block policies template -struct KernelDimensionCalculator>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -298,16 +298,16 @@ struct KernelDimensionCalculator(IndexMapper::grid_size) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); set_cuda_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); } }; -// specialization for unchecked global policies +// specialization for direct unchecked global policies template -struct KernelDimensionCalculator>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -337,7 +337,7 @@ struct KernelDimensionCalculator(IndexMapper::grid_size)); if ( len != (block_size * static_cast(IndexMapper::grid_size)) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } set_cuda_dim(dims.threads, block_size); set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); @@ -347,7 +347,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -360,7 +360,7 @@ struct KernelDimensionCalculator(IndexMapper::block_size)); if ( len != (static_cast(IndexMapper::block_size) * grid_size) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); set_cuda_dim(dims.blocks, grid_size); @@ -370,7 +370,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -384,7 +384,7 @@ struct KernelDimensionCalculator(IndexMapper::block_size) * 
static_cast(IndexMapper::grid_size)) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index fea2845e57..574899f408 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -358,7 +358,7 @@ struct LaunchExecute -struct LoopExecute, SEGMENT> { @@ -378,7 +378,7 @@ struct LoopExecute -struct LoopExecute, @@ -401,7 +401,7 @@ struct LoopExecute -struct LoopExecute -struct LoopICountExecute, SEGMENT> { @@ -642,7 +642,7 @@ struct LoopICountExecute -struct LoopICountExecute, @@ -667,7 +667,7 @@ struct LoopICountExecute -struct LoopICountExecute -struct LoopExecute, SEGMENT> - : LoopExecute, SEGMENT> {}; template -struct LoopExecute, @@ -931,7 +931,7 @@ struct LoopExecute -struct LoopExecute -struct TileExecute, SEGMENT> { @@ -1135,7 +1135,7 @@ struct TileExecute -struct TileExecute, @@ -1161,7 +1161,7 @@ struct TileExecute -struct TileExecute -struct TileTCountExecute, SEGMENT> { @@ -1416,7 +1416,7 @@ struct TileTCountExecute -struct TileTCountExecute, @@ -1446,7 +1446,7 @@ struct TileTCountExecute -struct TileTCountExecute>; using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< @@ -1588,8 +1588,8 @@ using cuda_launch_t = policy::cuda::cuda_launch_explicit_t -using cuda_indexer_unchecked = policy::cuda::cuda_indexer< - iteration_mapping::Unchecked, +using cuda_indexer_direct_unchecked = policy::cuda::cuda_indexer< + iteration_mapping::DirectUnchecked, kernel_sync_requirement::none, indexers...>; @@ -1612,8 +1612,8 @@ using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer< indexers...>; template < typename ... 
indexers > -using cuda_flatten_indexer_unchecked = policy::cuda::cuda_flatten_indexer< - iteration_mapping::Unchecked, +using cuda_flatten_indexer_direct_unchecked = policy::cuda::cuda_flatten_indexer< + iteration_mapping::DirectUnchecked, kernel_sync_requirement::none, indexers...>; @@ -1683,11 +1683,11 @@ using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer< * For example, a segment of size 1000 will only fit into 1000 threads, blocks, or global threads, and * triggers a runtime error in some cases. */ -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(, direct_unchecked) -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(, direct_unchecked) -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(, direct_unchecked) /*! * Maps segment indices to CUDA threads, blocks, or global threads. @@ -1734,11 +1734,11 @@ RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(, syncable_loop) * Reshapes multiple physical threads, blocks, or global threads into a 1D * iteration space */ -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(flatten_, direct_unchecked) -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, direct_unchecked) -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, direct_unchecked) /* * Maps segment indices to flattened CUDA threads, blocks, or global threads. @@ -1888,11 +1888,11 @@ RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, loop) * This is the lowest overhead mapping, but requires that there are the same * number of physical threads as the map requests. 
*/ -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, direct_unchecked) -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, direct_unchecked) -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, direct_unchecked) /*! * Maps segment indices to CUDA threads, blocks, or global threads. @@ -1924,11 +1924,11 @@ RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, loop) * Reshapes multiple physical threads, blocks, or global threads into a 1D * iteration space. */ -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, direct_unchecked) -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, direct_unchecked) -RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, direct_unchecked) /* * Maps segment indices to flattened CUDA threads, blocks, or global threads. 
diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index 1f181d4590..addb556b88 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -45,7 +45,7 @@ template , + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index 30d143c6cf..3342f994e0 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -32,7 +32,7 @@ namespace internal /* * Executor for work sharing inside HipKernel. - * Provides an unchecked mapping. + * Provides a direct unchecked mapping. 
* Assigns the loop index to offset ArgumentId * Assigns the loop index to param ParamId * Meets all sync requirements @@ -47,20 +47,20 @@ template , + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> : HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp index 90c147329c..55653ddfe5 100644 --- a/include/RAJA/policy/hip/kernel/Tile.hpp +++ b/include/RAJA/policy/hip/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -116,8 +116,8 @@ struct HipStatementExecutor< // restrict to first tile segment = segment.slice(0, static_cast(chunk_size)); - // NOTE: We do not detect improper uses of unchecked policies under tiling. - // This happens when using an unchecked policy on a tiled range that is not + // NOTE: We do not detect improper uses of direct_unchecked policies under tiling. + // This happens when using a direct unchecked policy on a tiled range that is not // evenly divisible by chunk_size. 
LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(private_data); diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp index 6975c5a083..d73c71169e 100644 --- a/include/RAJA/policy/hip/kernel/TileTCount.hpp +++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index c518d67f1f..b8a2f017b6 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -215,9 +215,9 @@ using hip_statement_list_executor_t = HipStatementListExecutor< template struct KernelDimensionCalculator; -// specialization for unchecked sequential policies +// specialization for direct unchecked sequential policies template -struct KernelDimensionCalculator>> { @@ -227,14 +227,14 @@ struct KernelDimensionCalculator(1) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } } }; -// specialization for unchecked thread policies +// specialization for direct unchecked thread policies template -struct KernelDimensionCalculator>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -262,16 +262,16 @@ struct KernelDimensionCalculator(IndexMapper::block_size) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped 
index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); set_hip_dim(min_dims.threads, static_cast(IndexMapper::block_size)); } }; -// specialization for unchecked block policies +// specialization for direct unchecked block policies template -struct KernelDimensionCalculator>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -298,16 +298,16 @@ struct KernelDimensionCalculator(IndexMapper::grid_size) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); set_hip_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); } }; -// specialization for unchecked global policies +// specialization for direct unchecked global policies template -struct KernelDimensionCalculator>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -337,7 +337,7 @@ struct KernelDimensionCalculator(IndexMapper::grid_size)); if ( len != (block_size * static_cast(IndexMapper::grid_size)) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } set_hip_dim(dims.threads, block_size); set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); @@ -347,7 +347,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -360,7 +360,7 @@ struct KernelDimensionCalculator(IndexMapper::block_size)); if ( len != (static_cast(IndexMapper::block_size) * grid_size) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index 
space"); } set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); set_hip_dim(dims.blocks, grid_size); @@ -370,7 +370,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator>> { @@ -384,7 +384,7 @@ struct KernelDimensionCalculator(IndexMapper::block_size) * static_cast(IndexMapper::grid_size)) ) { - RAJA_ABORT_OR_THROW("len does not match the size of the unchecked mapped index space"); + RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space"); } set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 18ab91526d..f5b4eda529 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -358,7 +358,7 @@ struct LaunchExecute> { HIP generic loop implementations */ template -struct LoopExecute, SEGMENT> { @@ -378,7 +378,7 @@ struct LoopExecute -struct LoopExecute, @@ -401,7 +401,7 @@ struct LoopExecute -struct LoopExecute -struct LoopICountExecute, SEGMENT> { @@ -642,7 +642,7 @@ struct LoopICountExecute -struct LoopICountExecute, @@ -667,7 +667,7 @@ struct LoopICountExecute -struct LoopICountExecute -struct LoopExecute, SEGMENT> - : LoopExecute, SEGMENT> {}; template -struct LoopExecute, @@ -931,7 +931,7 @@ struct LoopExecute -struct LoopExecute -struct TileExecute, SEGMENT> { @@ -1135,7 +1135,7 @@ struct TileExecute -struct TileExecute, @@ -1161,7 +1161,7 @@ struct TileExecute -struct TileExecute -struct TileTCountExecute, SEGMENT> { @@ -1416,7 +1416,7 @@ struct TileTCountExecute -struct TileTCountExecute, @@ -1446,7 +1446,7 @@ struct TileTCountExecute -struct TileTCountExecute>; using hip_warp_direct = RAJA::policy::hip::hip_indexer< @@ -1490,8 +1490,8 @@ using policy::hip::hip_launch_t; // policies usable with kernel and launch template < typename ... 
indexers > -using hip_indexer_unchecked = policy::hip::hip_indexer< - iteration_mapping::Unchecked, +using hip_indexer_direct_unchecked = policy::hip::hip_indexer< + iteration_mapping::DirectUnchecked, kernel_sync_requirement::none, indexers...>; @@ -1514,8 +1514,8 @@ using hip_indexer_syncable_loop = policy::hip::hip_indexer< indexers...>; template < typename ... indexers > -using hip_flatten_indexer_unchecked = policy::hip::hip_flatten_indexer< - iteration_mapping::Unchecked, +using hip_flatten_indexer_direct_unchecked = policy::hip::hip_flatten_indexer< + iteration_mapping::DirectUnchecked, kernel_sync_requirement::none, indexers...>; @@ -1585,11 +1585,11 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< * For example, a segment of size 1000 will only fit into 1000 threads, blocks, or global threads, and * triggers a runtime error in some cases. */ -RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(, direct_unchecked) -RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(, direct_unchecked) -RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(, direct_unchecked) /*! * Maps segment indices to HIP threads, blocks, or global threads. 
@@ -1636,11 +1636,11 @@ RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(, syncable_loop) * Reshapes multiple physical threads, blocks, or global threads into a 1D * iteration space */ -RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(flatten_, direct_unchecked) -RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(flatten_, direct_unchecked) -RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, direct_unchecked) /* * Maps segment indices to flattened HIP threads, blocks, or global threads. @@ -1788,11 +1788,11 @@ RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, loop) * This is the lowest overhead mapping, but requires that there are the same * number of physical threads as the map requests. */ -RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(, direct_unchecked) -RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(, direct_unchecked) -RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, direct_unchecked) /*! * Maps segment indices to HIP threads, blocks, or global threads. @@ -1824,11 +1824,11 @@ RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, loop) * Reshapes multiple physical threads, blocks, or global threads into a 1D * iteration space. 
*/ -RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, direct_unchecked) -RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, direct_unchecked) -RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, unchecked) +RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, direct_unchecked) /* * Maps segment indices to flattened HIP threads, blocks, or global threads. diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 53f30fe4cb..0674db71c4 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -70,7 +70,7 @@ enum struct kernel_sync_requirement : int namespace iteration_mapping { -struct UncheckedBase {}; +struct DirectUncheckedBase {}; struct DirectBase {}; struct LoopBase {}; struct ContiguousLoopBase : LoopBase {}; @@ -84,8 +84,9 @@ struct SizedLoopSpecifyingBase : SizedLoopBase }; /// -/// Unchecked assumes the loop has the same number of iterations and indices and -/// maps directly without bounds checking from an iteration to an index. +/// DirectUnchecked assumes the loop has the same number of iterations and +/// indices and maps directly without bounds checking from an iteration to an +/// index. /// /// For example a loop with 4 iterations mapping indices from a range of size 4. 
/// int iterations = 4; @@ -99,7 +100,7 @@ struct SizedLoopSpecifyingBase : SizedLoopBase /// // 2 -> {2} /// // 3 -> {3} /// -struct Unchecked : UncheckedBase {}; +struct DirectUnchecked : DirectUncheckedBase {}; /// /// Direct assumes the loop has enough iterations for all of the indices and diff --git a/test/functional/kernel/CMakeLists.txt b/test/functional/kernel/CMakeLists.txt index b7951b822d..efaf08e0f7 100644 --- a/test/functional/kernel/CMakeLists.txt +++ b/test/functional/kernel/CMakeLists.txt @@ -6,7 +6,7 @@ ############################################################################### list(APPEND KERNEL_BACKENDS Sequential) -set(KERNEL_UNCHECKED_BACKENDS "") +set(KERNEL_DIRECT_UNCHECKED_BACKENDS "") if(RAJA_ENABLE_OPENMP) list(APPEND KERNEL_BACKENDS OpenMP) @@ -14,12 +14,12 @@ endif() if(RAJA_ENABLE_CUDA) list(APPEND KERNEL_BACKENDS Cuda) - list(APPEND KERNEL_UNCHECKED_BACKENDS Cuda) + list(APPEND KERNEL_DIRECT_UNCHECKED_BACKENDS Cuda) endif() if(RAJA_ENABLE_HIP) list(APPEND KERNEL_BACKENDS Hip) - list(APPEND KERNEL_UNCHECKED_BACKENDS Hip) + list(APPEND KERNEL_DIRECT_UNCHECKED_BACKENDS Hip) endif() if(RAJA_ENABLE_SYCL) @@ -52,7 +52,7 @@ add_subdirectory(nested-loop-view-types) add_subdirectory(reduce-loc) -add_subdirectory(tile-icount-tcount-unchecked) +add_subdirectory(tile-icount-tcount-direct-unchecked) add_subdirectory(tile-icount-tcount-direct) diff --git a/test/functional/kernel/nested-loop-reducesum/test-kernel-nested-loop.cpp.in b/test/functional/kernel/nested-loop-reducesum/test-kernel-nested-loop.cpp.in index 6a493c2d6b..fa27eb90b6 100644 --- a/test/functional/kernel/nested-loop-reducesum/test-kernel-nested-loop.cpp.in +++ b/test/functional/kernel/nested-loop-reducesum/test-kernel-nested-loop.cpp.in @@ -56,7 +56,7 @@ using CudaKernelNestedLoopExecPols = camp::list< // Device Depth 3 ReduceSum Exec Pols NestedLoopData, - NestedLoopData, + NestedLoopData, NestedLoopData >; @@ -71,7 +71,7 @@ using HipKernelNestedLoopExecPols = camp::list< // 
Device Depth 3 ReduceSum Exec Pols NestedLoopData, - NestedLoopData, + NestedLoopData, NestedLoopData >; diff --git a/test/functional/kernel/nested-loop-segment-types/test-kernel-nested-loop-segments.cpp.in b/test/functional/kernel/nested-loop-segment-types/test-kernel-nested-loop-segments.cpp.in index 9e620aea4b..342c9dae83 100644 --- a/test/functional/kernel/nested-loop-segment-types/test-kernel-nested-loop-segments.cpp.in +++ b/test/functional/kernel/nested-loop-segment-types/test-kernel-nested-loop-segments.cpp.in @@ -83,7 +83,7 @@ using CudaKernelExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::CudaKernelAsync< RAJA::statement::For<0, RAJA::cuda_block_z_loop, - RAJA::statement::For<1, RAJA::cuda_block_y_unchecked, + RAJA::statement::For<1, RAJA::cuda_block_y_direct_unchecked, RAJA::statement::For<2, RAJA::cuda_thread_x_loop, RAJA::statement::Lambda<0, RAJA::Segs<0, 1, 2>> > @@ -113,7 +113,7 @@ using HipKernelExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::HipKernelAsync< RAJA::statement::For<0, RAJA::hip_block_z_loop, - RAJA::statement::For<1, RAJA::hip_block_y_unchecked, + RAJA::statement::For<1, RAJA::hip_block_y_direct_unchecked, RAJA::statement::For<2, RAJA::hip_thread_x_loop, RAJA::statement::Lambda<0, RAJA::Segs<0, 1, 2>> > diff --git a/test/functional/kernel/nested-loop-view-types/test-kernel-nested-loop-view.cpp.in b/test/functional/kernel/nested-loop-view-types/test-kernel-nested-loop-view.cpp.in index 601ddfafe7..add4aa2915 100644 --- a/test/functional/kernel/nested-loop-view-types/test-kernel-nested-loop-view.cpp.in +++ b/test/functional/kernel/nested-loop-view-types/test-kernel-nested-loop-view.cpp.in @@ -145,7 +145,7 @@ camp::list< RAJA::statement::Tile<0, RAJA::tile_fixed<8>, RAJA::cuda_block_y_direct, RAJA::statement::Tile<1, RAJA::tile_fixed<8>, - RAJA::cuda_block_x_unchecked, + RAJA::cuda_block_x_direct_unchecked, RAJA::statement::For<0, RAJA::cuda_thread_y_direct, // outer RAJA::statement::For<1, 
RAJA::cuda_thread_x_direct, // inner RAJA::statement::Lambda<0> @@ -176,11 +176,11 @@ camp::list< RAJA::KernelPolicy< RAJA::statement::CudaKernelFixed<4*8*8, RAJA::statement::Tile<0, RAJA::tile_fixed<4>, - RAJA::cuda_block_z_unchecked, + RAJA::cuda_block_z_direct_unchecked, RAJA::statement::Tile<1, RAJA::tile_fixed<8>, - RAJA::cuda_block_y_unchecked, + RAJA::cuda_block_y_direct_unchecked, RAJA::statement::Tile<2, RAJA::tile_fixed<8>, - RAJA::cuda_block_x_unchecked, + RAJA::cuda_block_x_direct_unchecked, RAJA::statement::For<0, RAJA::cuda_thread_z_direct, // outer RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // middle RAJA::statement::For<2, RAJA::cuda_thread_x_direct, // inner @@ -216,7 +216,7 @@ camp::list< RAJA::statement::Tile<0, RAJA::tile_fixed<8>, RAJA::hip_block_y_direct, RAJA::statement::Tile<1, RAJA::tile_fixed<8>, - RAJA::hip_block_x_unchecked, + RAJA::hip_block_x_direct_unchecked, RAJA::statement::For<0, RAJA::hip_thread_y_direct, // outer RAJA::statement::For<1, RAJA::hip_thread_x_direct, // inner RAJA::statement::Lambda<0> @@ -247,11 +247,11 @@ camp::list< RAJA::KernelPolicy< RAJA::statement::HipKernelFixed<4*8*8, RAJA::statement::Tile<0, RAJA::tile_fixed<4>, - RAJA::hip_block_z_unchecked, + RAJA::hip_block_z_direct_unchecked, RAJA::statement::Tile<1, RAJA::tile_fixed<8>, - RAJA::hip_block_y_unchecked, + RAJA::hip_block_y_direct_unchecked, RAJA::statement::Tile<2, RAJA::tile_fixed<8>, - RAJA::hip_block_x_unchecked, + RAJA::hip_block_x_direct_unchecked, RAJA::statement::For<0, RAJA::hip_thread_z_direct, // outer RAJA::statement::For<1, RAJA::hip_thread_y_direct, // middle RAJA::statement::For<2, RAJA::hip_thread_x_direct, // inner diff --git a/test/functional/kernel/nested-loop/test-kernel-nested-loop.cpp.in b/test/functional/kernel/nested-loop/test-kernel-nested-loop.cpp.in index d0e3166583..be5320c55d 100644 --- a/test/functional/kernel/nested-loop/test-kernel-nested-loop.cpp.in +++ 
b/test/functional/kernel/nested-loop/test-kernel-nested-loop.cpp.in @@ -73,11 +73,11 @@ using CudaKernelNestedLoopExecPols = camp::list< // Depth 2 Exec Pols NestedLoopData, NestedLoopData, - NestedLoopData, + NestedLoopData, NestedLoopData, RAJA::cuda_global_size_y_loop<8> >, // Depth 3 Exec Pols - NestedLoopData, + NestedLoopData, NestedLoopData, NestedLoopData, RAJA::cuda_global_size_y_direct<16>, RAJA::seq_exec > >; @@ -91,11 +91,11 @@ using HipKernelNestedLoopExecPols = camp::list< // Depth 2 Exec Pols NestedLoopData, NestedLoopData, - NestedLoopData, + NestedLoopData, NestedLoopData, RAJA::hip_global_size_y_loop<4> >, // Depth 3 Exec Pols - NestedLoopData, + NestedLoopData, NestedLoopData, NestedLoopData, RAJA::hip_global_size_y_direct<8>, RAJA::seq_exec > >; diff --git a/test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt b/test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt index be28532b6e..77d9c61a33 100644 --- a/test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt +++ b/test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt @@ -16,16 +16,16 @@ set(TILESIZES 8 32) # # Note: KERNEL_BACKENDS is defined in ../CMakeLists.txt # -foreach( BACKEND ${KERNEL_UNCHECKED_BACKENDS} ) +foreach( BACKEND ${KERNEL_DIRECT_UNCHECKED_BACKENDS} ) # using omp target crashes the compiler with this one if( NOT ((BACKEND STREQUAL "OpenMPTarget")) ) foreach( TESTTYPE ${TESTTYPES} ) foreach( TILESIZE ${TILESIZES} ) - configure_file( test-kernel-tile-count-unchecked.cpp.in - test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-unchecked.cpp ) - raja_add_test( NAME test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-unchecked - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-unchecked.cpp ) - target_include_directories(test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-unchecked.exe + configure_file( test-kernel-tile-count-direct-unchecked.cpp.in + 
test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-direct-unchecked.cpp ) + raja_add_test( NAME test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-direct-unchecked + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-direct-unchecked.cpp ) + target_include_directories(test-kernel-${TESTTYPE}-${TILESIZE}-${BACKEND}-direct-unchecked.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() endforeach() diff --git a/test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in b/test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in index fe526aa5a6..3680f2be59 100644 --- a/test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in +++ b/test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in @@ -18,7 +18,7 @@ // Header for tests in ./tests directory // // Note: CMake adds ./tests as an include dir for these tests. -#include "test-kernel-tile-@TESTTYPE@-unchecked.hpp" +#include "test-kernel-tile-@TESTTYPE@-direct-unchecked.hpp" // // Note that a separate test file/executable is generated for each tile size @@ -42,8 +42,8 @@ using CudaKernelForICountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_unchecked, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_direct_unchecked, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct_unchecked, RAJA::statement::Lambda<0> > > @@ -56,8 +56,8 @@ using CudaKernelTileTCountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_unchecked, - RAJA::statement::For<0, RAJA::cuda_thread_x_unchecked, + 
RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::cuda_block_x_direct_unchecked, + RAJA::statement::For<0, RAJA::cuda_thread_x_direct_unchecked, RAJA::statement::Lambda<0> > > @@ -77,8 +77,8 @@ using HipKernelForICountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_unchecked, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::hip_thread_x_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_direct_unchecked, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct_unchecked, RAJA::statement::Lambda<0> > > @@ -91,8 +91,8 @@ using HipKernelTileTCountExecPols = camp::list< RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_unchecked, - RAJA::statement::For<0, RAJA::hip_thread_x_unchecked, + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, RAJA::tile_fixed<@TILESIZE@>, RAJA::hip_block_x_direct_unchecked, + RAJA::statement::For<0, RAJA::hip_thread_x_direct_unchecked, RAJA::statement::Lambda<0> > > @@ -115,5 +115,5 @@ using @BACKEND@KernelTile@TESTTYPE@Types = // Instantiate parameterized tests // INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, - KernelTile@TESTTYPE@UncheckedTest, + KernelTile@TESTTYPE@DirectUncheckedTest, @BACKEND@KernelTile@TESTTYPE@Types); diff --git a/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp b/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp index 1a831c3f12..c173369481 100644 --- a/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp +++ b/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: 
(BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef __TEST_KERNEL_TILE_FORICOUNT_UNCHECKED_HPP__ -#define __TEST_KERNEL_TILE_FORICOUNT_UNCHECKED_HPP__ +#ifndef __TEST_KERNEL_TILE_FORICOUNT_DIRECT_UNCHECKED_HPP__ +#define __TEST_KERNEL_TILE_FORICOUNT_DIRECT_UNCHECKED_HPP__ // // Value struct for manipulating tile sizes in parameterized tests. @@ -18,7 +18,7 @@ struct Value { template -void KernelTileForICountUncheckedTestImpl(IDX_TYPE N, IDX_TYPE tsize) +void KernelTileForICountDirectUncheckedTestImpl(IDX_TYPE N, IDX_TYPE tsize) { RAJA::ReduceSum trip_count(0); @@ -55,14 +55,14 @@ void KernelTileForICountUncheckedTestImpl(IDX_TYPE N, IDX_TYPE tsize) } -TYPED_TEST_SUITE_P(KernelTileForICountUncheckedTest); +TYPED_TEST_SUITE_P(KernelTileForICountDirectUncheckedTest); template -class KernelTileForICountUncheckedTest : public ::testing::Test +class KernelTileForICountDirectUncheckedTest : public ::testing::Test { }; -TYPED_TEST_P(KernelTileForICountUncheckedTest, ForICountTileKernel) +TYPED_TEST_P(KernelTileForICountDirectUncheckedTest, ForICountTileKernel) { using IDX_TYPE = typename camp::at>::type; using EXEC_POLICY = typename camp::at>::type; @@ -70,16 +70,16 @@ TYPED_TEST_P(KernelTileForICountUncheckedTest, ForICountTileKernel) IDX_TYPE tsize = camp::at_v::value; - KernelTileForICountUncheckedTestImpl( + KernelTileForICountDirectUncheckedTestImpl( IDX_TYPE(0), tsize); - KernelTileForICountUncheckedTestImpl( + KernelTileForICountDirectUncheckedTestImpl( IDX_TYPE(tsize), tsize); - KernelTileForICountUncheckedTestImpl( + KernelTileForICountDirectUncheckedTestImpl( IDX_TYPE(13*tsize), tsize); } -REGISTER_TYPED_TEST_SUITE_P(KernelTileForICountUncheckedTest, +REGISTER_TYPED_TEST_SUITE_P(KernelTileForICountDirectUncheckedTest, ForICountTileKernel); -#endif // __TEST_KERNEL_TILE_FORICOUNT_UNCHECKED_HPP__ +#endif // __TEST_KERNEL_TILE_FORICOUNT_DIRECT_UNCHECKED_HPP__ diff --git 
a/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp b/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp index ef56efd788..9407741025 100644 --- a/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp +++ b/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef __TEST_KERNEL_TILE_TILETCOUNT_UNCHECKED_HPP_ -#define __TEST_KERNEL_TILE_TILETCOUNT_UNCHECKED_HPP_ +#ifndef __TEST_KERNEL_TILE_TILETCOUNT_DIRECT_UNCHECKED_HPP_ +#define __TEST_KERNEL_TILE_TILETCOUNT_DIRECT_UNCHECKED_HPP_ // // Value struct for manipulating tile sizes in parameterized tests. @@ -18,7 +18,7 @@ struct Value { template -void KernelTileTileTCountUncheckedTestImpl(IDX_TYPE N, IDX_TYPE tsize) +void KernelTileTileTCountDirectUncheckedTestImpl(IDX_TYPE N, IDX_TYPE tsize) { IDX_TYPE NT = (N + tsize - 1) / tsize; @@ -57,14 +57,14 @@ void KernelTileTileTCountUncheckedTestImpl(IDX_TYPE N, IDX_TYPE tsize) } -TYPED_TEST_SUITE_P(KernelTileTileTCountUncheckedTest); +TYPED_TEST_SUITE_P(KernelTileTileTCountDirectUncheckedTest); template -class KernelTileTileTCountUncheckedTest : public ::testing::Test +class KernelTileTileTCountDirectUncheckedTest : public ::testing::Test { }; -TYPED_TEST_P(KernelTileTileTCountUncheckedTest, TileTCountTileKernel) +TYPED_TEST_P(KernelTileTileTCountDirectUncheckedTest, TileTCountTileKernel) { using IDX_TYPE = typename camp::at>::type; using EXEC_POLICY = typename camp::at>::type; @@ -72,16 +72,16 @@ TYPED_TEST_P(KernelTileTileTCountUncheckedTest, TileTCountTileKernel) IDX_TYPE tsize = camp::at_v::value; - KernelTileTileTCountUncheckedTestImpl( + KernelTileTileTCountDirectUncheckedTestImpl( IDX_TYPE(0), tsize); - 
KernelTileTileTCountUncheckedTestImpl( + KernelTileTileTCountDirectUncheckedTestImpl( IDX_TYPE(tsize), tsize); - KernelTileTileTCountUncheckedTestImpl( + KernelTileTileTCountDirectUncheckedTestImpl( IDX_TYPE(13*tsize), tsize); } -REGISTER_TYPED_TEST_SUITE_P(KernelTileTileTCountUncheckedTest, +REGISTER_TYPED_TEST_SUITE_P(KernelTileTileTCountDirectUncheckedTest, TileTCountTileKernel); -#endif // __TEST_KERNEL_TILE_TILETCOUNT_UNCHECKED_HPP_ +#endif // __TEST_KERNEL_TILE_TILETCOUNT_DIRECT_UNCHECKED_HPP_ diff --git a/test/functional/kernel/tile-variants/test-kernel-tiledyn.cpp.in b/test/functional/kernel/tile-variants/test-kernel-tiledyn.cpp.in index 401561065c..822d3d089b 100644 --- a/test/functional/kernel/tile-variants/test-kernel-tiledyn.cpp.in +++ b/test/functional/kernel/tile-variants/test-kernel-tiledyn.cpp.in @@ -147,7 +147,7 @@ using OpenMPTargetKernelTileExecPols = // RAJA::KernelPolicy< // RAJA::statement::CudaKernel< // RAJA::statement::Tile<1, RAJA::tile_dynamic<1>, RAJA::seq_exec, -// RAJA::statement::Tile<0, RAJA::tile_dynamic<0>, RAJA::cuda_block_x_unchecked, +// RAJA::statement::Tile<0, RAJA::tile_dynamic<0>, RAJA::cuda_block_x_direct_unchecked, // RAJA::statement::For<1, RAJA::seq_exec, // RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // RAJA::statement::Lambda<0, RAJA::Segs<0,1>, RAJA::Params<>> @@ -198,7 +198,7 @@ using OpenMPTargetKernelTileExecPols = // RAJA::KernelPolicy< // RAJA::statement::HipKernel< // RAJA::statement::Tile<1, RAJA::tile_dynamic<1>, RAJA::seq_exec, -// RAJA::statement::Tile<0, RAJA::tile_dynamic<0>, RAJA::hip_block_x_unchecked, +// RAJA::statement::Tile<0, RAJA::tile_dynamic<0>, RAJA::hip_block_x_direct_unchecked, // RAJA::statement::For<1, RAJA::seq_exec, // RAJA::statement::For<0, RAJA::hip_thread_x_loop, // RAJA::statement::Lambda<0, RAJA::Segs<0,1>, RAJA::Params<>> diff --git a/test/functional/kernel/tile-variants/test-kernel-tilefixed.cpp.in b/test/functional/kernel/tile-variants/test-kernel-tilefixed.cpp.in index 
9204ab4548..4e3fca9704 100644 --- a/test/functional/kernel/tile-variants/test-kernel-tilefixed.cpp.in +++ b/test/functional/kernel/tile-variants/test-kernel-tilefixed.cpp.in @@ -147,7 +147,7 @@ using CudaKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct_unchecked, RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::cuda_thread_x_loop, RAJA::statement::Lambda<0> @@ -198,7 +198,7 @@ using HipKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_direct_unchecked, RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::hip_thread_x_loop, RAJA::statement::Lambda<0> diff --git a/test/functional/kernel/tile-variants/test-kernel-tilelocal.cpp.in b/test/functional/kernel/tile-variants/test-kernel-tilelocal.cpp.in index b2a1b6a9bd..c2cb62180b 100644 --- a/test/functional/kernel/tile-variants/test-kernel-tilelocal.cpp.in +++ b/test/functional/kernel/tile-variants/test-kernel-tilelocal.cpp.in @@ -95,8 +95,8 @@ using CudaKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_x_unchecked, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_y_unchecked, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_x_direct_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_y_direct_unchecked, RAJA::statement::InitLocalMem, @@ -132,8 +132,8 @@ using HipKernelTileExecPols = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_x_unchecked, - 
RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_y_unchecked, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_x_direct_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_y_direct_unchecked, RAJA::statement::InitLocalMem, diff --git a/test/functional/launch/CMakeLists.txt b/test/functional/launch/CMakeLists.txt index 25a0bf9379..b6cd1b1755 100644 --- a/test/functional/launch/CMakeLists.txt +++ b/test/functional/launch/CMakeLists.txt @@ -6,7 +6,7 @@ ############################################################################### list(APPEND LAUNCH_BACKENDS Sequential) -set(LAUNCH_UNCHECKED_BACKENDS "") +set(LAUNCH_DIRECT_UNCHECKED_BACKENDS "") if(RAJA_ENABLE_OPENMP) list(APPEND LAUNCH_BACKENDS OpenMP) @@ -14,12 +14,12 @@ endif() if(RAJA_ENABLE_CUDA) list(APPEND LAUNCH_BACKENDS Cuda) - list(APPEND LAUNCH_UNCHECKED_BACKENDS Cuda) + list(APPEND LAUNCH_DIRECT_UNCHECKED_BACKENDS Cuda) endif() if(RAJA_ENABLE_HIP) list(APPEND LAUNCH_BACKENDS Hip) - list(APPEND LAUNCH_UNCHECKED_BACKENDS Hip) + list(APPEND LAUNCH_DIRECT_UNCHECKED_BACKENDS Hip) endif() if(RAJA_ENABLE_SYCL) @@ -39,24 +39,24 @@ add_subdirectory(segment) add_subdirectory(shared_mem) -add_subdirectory(nested_unchecked) +add_subdirectory(nested_direct_unchecked) add_subdirectory(nested_direct) add_subdirectory(nested_loop) -add_subdirectory(tile_icount_tcount_unchecked) +add_subdirectory(tile_icount_tcount_direct_unchecked) add_subdirectory(tile_icount_tcount_direct) add_subdirectory(tile_icount_tcount_loop) -add_subdirectory(nested_tile_unchecked) +add_subdirectory(nested_tile_direct_unchecked) add_subdirectory(nested_tile_direct) add_subdirectory(nested_tile_loop) unset( LAUNCH_BACKENDS ) -unset( LAUNCH_UNCHECKED_BACKENDS ) +unset( LAUNCH_DIRECT_UNCHECKED_BACKENDS ) diff --git a/test/functional/launch/nested_tile_unchecked/CMakeLists.txt b/test/functional/launch/nested_tile_unchecked/CMakeLists.txt index 6152dfa2ad..ef87cf263c 100644 --- 
a/test/functional/launch/nested_tile_unchecked/CMakeLists.txt +++ b/test/functional/launch/nested_tile_unchecked/CMakeLists.txt @@ -10,13 +10,13 @@ # # -foreach( BACKEND ${LAUNCH_UNCHECKED_BACKENDS} ) - configure_file( test-launch-nested-tile-unchecked.cpp.in - test-launch-nested-Tile-Unchecked-${BACKEND}.cpp ) - raja_add_test( NAME test-launch-nested-Tile-Unchecked-${BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-Unchecked-${BACKEND}.cpp ) +foreach( BACKEND ${LAUNCH_DIRECT_UNCHECKED_BACKENDS} ) + configure_file( test-launch-nested-tile-direct-unchecked.cpp.in + test-launch-nested-Tile-DirectUnchecked-${BACKEND}.cpp ) + raja_add_test( NAME test-launch-nested-Tile-DirectUnchecked-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-DirectUnchecked-${BACKEND}.cpp ) - target_include_directories(test-launch-nested-Tile-Unchecked-${BACKEND}.exe + target_include_directories(test-launch-nested-Tile-DirectUnchecked-${BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() diff --git a/test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in b/test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in index 85595ac970..8239f01441 100644 --- a/test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in +++ b/test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in @@ -13,14 +13,14 @@ #include "RAJA_test-index-types.hpp" #include "RAJA_test-forall-data.hpp" -#include "RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp" +#include "RAJA_test-launch-direct-unchecked-teams-threads-3D-execpol.hpp" // // Header for tests in ./tests directory // // Note: CMake adds ./tests as an include dir for these tests. 
// -#include "test-launch-nested-Tile-Unchecked.hpp" +#include "test-launch-nested-Tile-DirectUnchecked.hpp" // @@ -35,5 +35,5 @@ using @BACKEND@LaunchNestedTypes = // Instantiate parameterized test // INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, - LaunchNestedTileUncheckedTest, + LaunchNestedTileDirectUncheckedTest, @BACKEND@LaunchNestedTypes); diff --git a/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp b/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp index c50a5fb267..3dcbea06e8 100644 --- a/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp +++ b/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp @@ -5,15 +5,15 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef __TEST_LAUNCH_NESTED_TILE_UNCHECKED_HPP__ -#define __TEST_LAUNCH_NESTED_TILE_UNCHECKED_HPP__ +#ifndef __TEST_LAUNCH_NESTED_TILE_DIRECT_UNCHECKED_HPP__ +#define __TEST_LAUNCH_NESTED_TILE_DIRECT_UNCHECKED_HPP__ #include template -void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) +void LaunchNestedTileDirectUncheckedTestImpl(INDEX_TYPE M) { const int tile_size_x = 2; @@ -104,14 +104,14 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) } -TYPED_TEST_SUITE_P(LaunchNestedTileUncheckedTest); +TYPED_TEST_SUITE_P(LaunchNestedTileDirectUncheckedTest); template -class LaunchNestedTileUncheckedTest : public ::testing::Test +class LaunchNestedTileDirectUncheckedTest : public ::testing::Test { }; -TYPED_TEST_P(LaunchNestedTileUncheckedTest, RangeSegmentTeams) +TYPED_TEST_P(LaunchNestedTileDirectUncheckedTest, RangeSegmentTeams) { using INDEX_TYPE = typename camp::at>::type; @@ -128,13 +128,13 @@ TYPED_TEST_P(LaunchNestedTileUncheckedTest, RangeSegmentTeams) // test zero-length range segment - LaunchNestedTileUncheckedTestImpl (INDEX_TYPE(0)); - //Keep at one since we are 
doing a unchecked thread test - LaunchNestedTileUncheckedTestImpl (INDEX_TYPE(1)); @@ -142,7 +142,7 @@ TYPED_TEST_P(LaunchNestedTileUncheckedTest, RangeSegmentTeams) } -REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileUncheckedTest, +REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileDirectUncheckedTest, RangeSegmentTeams); -#endif // __TEST_LAUNCH_NESTED_TILE_UNCHECKED_HPP__ +#endif // __TEST_LAUNCH_NESTED_TILE_DIRECT_UNCHECKED_HPP__ diff --git a/test/functional/launch/nested_unchecked/CMakeLists.txt b/test/functional/launch/nested_unchecked/CMakeLists.txt index cb67616db9..cc9cd2a7cf 100644 --- a/test/functional/launch/nested_unchecked/CMakeLists.txt +++ b/test/functional/launch/nested_unchecked/CMakeLists.txt @@ -8,14 +8,14 @@ # # List of segment types for generating test files. # -set(NESTEDTYPES Unchecked) +set(NESTEDTYPES DirectUnchecked) # # Generate tests for each enabled RAJA back-end. # # -foreach( BACKEND ${LAUNCH_UNCHECKED_BACKENDS} ) +foreach( BACKEND ${LAUNCH_DIRECT_UNCHECKED_BACKENDS} ) foreach( NESTEDTYPES ${NESTEDTYPES} ) configure_file( test-launch-nested.cpp.in test-launch-nested-${NESTEDTYPES}-${BACKEND}.cpp ) diff --git a/test/functional/launch/nested_unchecked/test-launch-nested.cpp.in b/test/functional/launch/nested_unchecked/test-launch-nested.cpp.in index 08ec672089..066143508a 100644 --- a/test/functional/launch/nested_unchecked/test-launch-nested.cpp.in +++ b/test/functional/launch/nested_unchecked/test-launch-nested.cpp.in @@ -13,7 +13,7 @@ #include "RAJA_test-index-types.hpp" #include "RAJA_test-forall-data.hpp" -#include "RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp" +#include "RAJA_test-launch-direct-unchecked-teams-threads-3D-execpol.hpp" // // Header for tests in ./tests directory diff --git a/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp b/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp index ed2a096c39..49e799b2e3 100644 --- 
a/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp +++ b/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp @@ -5,15 +5,15 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef __TEST_LAUNCH_NESTED_UNCHECKED_HPP__ -#define __TEST_LAUNCH_NESTED_UNCHECKED_HPP__ +#ifndef __TEST_LAUNCH_NESTED_DIRECT_UNCHECKED_HPP__ +#define __TEST_LAUNCH_NESTED_DIRECT_UNCHECKED_HPP__ #include template -void LaunchNestedUncheckedTestImpl(INDEX_TYPE M) +void LaunchNestedDirectUncheckedTestImpl(INDEX_TYPE M) { RAJA::TypedRangeSegment r1(0, 2*M); @@ -108,14 +108,14 @@ void LaunchNestedUncheckedTestImpl(INDEX_TYPE M) } -TYPED_TEST_SUITE_P(LaunchNestedUncheckedTest); +TYPED_TEST_SUITE_P(LaunchNestedDirectUncheckedTest); template -class LaunchNestedUncheckedTest : public ::testing::Test +class LaunchNestedDirectUncheckedTest : public ::testing::Test { }; -TYPED_TEST_P(LaunchNestedUncheckedTest, RangeSegmentTeams) +TYPED_TEST_P(LaunchNestedDirectUncheckedTest, RangeSegmentTeams) { using INDEX_TYPE = typename camp::at>::type; @@ -133,13 +133,13 @@ TYPED_TEST_P(LaunchNestedUncheckedTest, RangeSegmentTeams) // test zero-length range segment - LaunchNestedUncheckedTestImpl (INDEX_TYPE(0)); - //Keep at one since we are doing a unchecked thread test - LaunchNestedUncheckedTestImpl (INDEX_TYPE(1)); @@ -147,7 +147,7 @@ TYPED_TEST_P(LaunchNestedUncheckedTest, RangeSegmentTeams) } -REGISTER_TYPED_TEST_SUITE_P(LaunchNestedUncheckedTest, +REGISTER_TYPED_TEST_SUITE_P(LaunchNestedDirectUncheckedTest, RangeSegmentTeams); -#endif // __TEST_LAUNCH_NESTED_UNCHECKED_HPP__ +#endif // __TEST_LAUNCH_NESTED_DIRECT_UNCHECKED_HPP__ diff --git a/test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt b/test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt index cb01f0b926..b81a48b86e 100644 --- 
a/test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt +++ b/test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt @@ -10,13 +10,13 @@ # # -foreach( BACKEND ${LAUNCH_UNCHECKED_BACKENDS} ) - configure_file( test-launch-nested-tile-icount-tcount-unchecked.cpp.in - test-launch-nested-Tile-iCount-tCount-Unchecked-${BACKEND}.cpp ) - raja_add_test( NAME test-launch-nested-Tile-iCount-tCount-Unchecked-${BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-iCount-tCount-Unchecked-${BACKEND}.cpp ) +foreach( BACKEND ${LAUNCH_DIRECT_UNCHECKED_BACKENDS} ) + configure_file( test-launch-nested-tile-icount-tcount-direct-unchecked.cpp.in + test-launch-nested-Tile-iCount-tCount-DirectUnchecked-${BACKEND}.cpp ) + raja_add_test( NAME test-launch-nested-Tile-iCount-tCount-DirectUnchecked-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-Tile-iCount-tCount-DirectUnchecked-${BACKEND}.cpp ) - target_include_directories(test-launch-nested-Tile-iCount-tCount-Unchecked-${BACKEND}.exe + target_include_directories(test-launch-nested-Tile-iCount-tCount-DirectUnchecked-${BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() diff --git a/test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in b/test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in index 3ddaeff554..eff627c848 100644 --- a/test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in +++ b/test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in @@ -13,14 +13,14 @@ #include "RAJA_test-index-types.hpp" #include "RAJA_test-forall-data.hpp" -#include "RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp" +#include "RAJA_test-launch-direct-unchecked-teams-threads-1D-execpol.hpp" // // Header for tests in ./tests directory // // Note: CMake adds ./tests as an 
include dir for these tests. // -#include "test-launch-nested-Tile-iCount-tCount-Unchecked.hpp" +#include "test-launch-nested-Tile-iCount-tCount-DirectUnchecked.hpp" // @@ -35,5 +35,5 @@ using @BACKEND@LaunchNestedTypes = // Instantiate parameterized test // INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, - LaunchNestedTileUncheckedTest, + LaunchNestedTileDirectUncheckedTest, @BACKEND@LaunchNestedTypes); diff --git a/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp b/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp index d9b19a83a7..0cfcb3121e 100644 --- a/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp +++ b/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp @@ -5,14 +5,14 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_UNCHECKED_HPP__ -#define __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_UNCHECKED_HPP__ +#ifndef __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_DIRECT_UNCHECKED_HPP__ +#define __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_DIRECT_UNCHECKED_HPP__ #include template -void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) +void LaunchNestedTileDirectUncheckedTestImpl(INDEX_TYPE M) { constexpr int threads_x = 4; @@ -99,14 +99,14 @@ void LaunchNestedTileUncheckedTestImpl(INDEX_TYPE M) } -TYPED_TEST_SUITE_P(LaunchNestedTileUncheckedTest); +TYPED_TEST_SUITE_P(LaunchNestedTileDirectUncheckedTest); template -class LaunchNestedTileUncheckedTest : public ::testing::Test +class LaunchNestedTileDirectUncheckedTest : public ::testing::Test { }; -TYPED_TEST_P(LaunchNestedTileUncheckedTest, RangeSegmentTeams) +TYPED_TEST_P(LaunchNestedTileDirectUncheckedTest, RangeSegmentTeams) { using INDEX_TYPE = typename camp::at>::type; @@ 
-118,23 +118,23 @@ TYPED_TEST_P(LaunchNestedTileUncheckedTest, RangeSegmentTeams) // test zero-length range segment - LaunchNestedTileUncheckedTestImpl (INDEX_TYPE(0)); - //Keep at one since we are doing a unchecked thread test - LaunchNestedTileUncheckedTestImpl (INDEX_TYPE(1)); - LaunchNestedTileUncheckedTestImpl (INDEX_TYPE(2)); } -REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileUncheckedTest, +REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileDirectUncheckedTest, RangeSegmentTeams); -#endif // __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_UNCHECKED_HPP__ +#endif // __TEST_LAUNCH_NESTED_TILE_ICOUNT_TCOUNT_DIRECT_UNCHECKED_HPP__ diff --git a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp index 5b756e00bf..da87899469 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp @@ -45,7 +45,7 @@ using OpenMP_launch_policies = camp::list; using cuda_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, RAJA::LoopPolicy >; @@ -68,7 +68,7 @@ using Cuda_launch_policies = using hip_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, RAJA::LoopPolicy >; diff --git a/test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp index da7eac0553..dccb077a2b 100644 --- a/test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp @@ -9,8 +9,8 @@ // Execution policy lists used throughout launch tests // -#ifndef __RAJA_TEST_LAUNCH_UNCHECKED_TEAMS_THREADS_1D_EXECPOL_HPP__ -#define __RAJA_TEST_LAUNCH_UNCHECKED_TEAMS_THREADS_1D_EXECPOL_HPP__ +#ifndef __RAJA_TEST_LAUNCH_DIRECT_UNCHECKED_TEAMS_THREADS_1D_EXECPOL_HPP__ +#define 
__RAJA_TEST_LAUNCH_DIRECT_UNCHECKED_TEAMS_THREADS_1D_EXECPOL_HPP__ #include "RAJA/RAJA.hpp" #include "camp/list.hpp" @@ -18,39 +18,39 @@ #if defined(RAJA_ENABLE_CUDA) -using cuda_unchecked_policies = +using cuda_direct_unchecked_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; -using cuda_unchecked_explicit_policies = +using cuda_direct_unchecked_explicit_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Cuda_launch_policies = camp::list< - cuda_unchecked_policies, - cuda_unchecked_explicit_policies + cuda_direct_unchecked_policies, + cuda_direct_unchecked_explicit_policies >; #endif // RAJA_ENABLE_CUDA #if defined(RAJA_ENABLE_HIP) -using hip_unchecked_policies = +using hip_direct_unchecked_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; -using Hip_launch_policies = camp::list; +using Hip_launch_policies = camp::list; #endif // RAJA_ENABLE_HIP -#endif // __RAJA_TEST_LAUNCH_UNCHECKED_TEAMS_THREADS_1D_EXECPOL_HPP__ +#endif // __RAJA_TEST_LAUNCH_DIRECT_UNCHECKED_TEAMS_THREADS_1D_EXECPOL_HPP__ diff --git a/test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp index 59e16ab3e8..125e84dd49 100644 --- a/test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp +++ b/test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp @@ -9,60 +9,60 @@ // Execution policy lists used throughout launch tests // -#ifndef __RAJA_TEST_LAUNCH_UNCHECKED_TEAM_THREADS_3D_EXECPOL_HPP__ -#define __RAJA_TEST_LAUNCH_UNCHECKED_TEAM_THREADS_3D_EXECPOL_HPP__ +#ifndef __RAJA_TEST_LAUNCH_DIRECT_UNCHECKED_TEAM_THREADS_3D_EXECPOL_HPP__ +#define __RAJA_TEST_LAUNCH_DIRECT_UNCHECKED_TEAM_THREADS_3D_EXECPOL_HPP__ #include "RAJA/RAJA.hpp" #include "camp/list.hpp" #if 
defined(RAJA_ENABLE_CUDA) -using cuda_unchecked_policies = +using cuda_direct_unchecked_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy >; -using cuda_unchecked_explicit_policies = +using cuda_direct_unchecked_explicit_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Cuda_launch_policies = camp::list< - cuda_unchecked_policies, - cuda_unchecked_explicit_policies + cuda_direct_unchecked_policies, + cuda_direct_unchecked_explicit_policies >; #endif // RAJA_ENABLE_CUDA #if defined(RAJA_ENABLE_HIP) -using hip_unchecked_policies = +using hip_direct_unchecked_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy, + RAJA::LoopPolicy >; -using Hip_launch_policies = camp::list; +using Hip_launch_policies = camp::list; #endif // RAJA_ENABLE_HIP -#endif //__RAJA_TEST_LAUNCH_UNCHECKED_TEAM_THREADS_3D_EXECPOL_HPP__ +#endif //__RAJA_TEST_LAUNCH_DIRECT_UNCHECKED_TEAM_THREADS_3D_EXECPOL_HPP__ From 7bf16bae68041c5624ae3ffb0dc5a8bcc660de07 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 30 Dec 2024 14:11:52 -0800 Subject: [PATCH 15/15] rename test files --- .../CMakeLists.txt | 0 .../test-kernel-tile-count-direct-unchecked.cpp.in} | 0 .../tests/test-kernel-tile-ForICount-direct-unchecked.hpp} | 0 .../tests/test-kernel-tile-TileTCount-direct-unchecked.hpp} | 0 .../{nested_unchecked => 
nested_direct_unchecked}/CMakeLists.txt | 0 .../test-launch-nested.cpp.in | 0 .../tests/test-launch-nested-DirectUnchecked.hpp} | 0 .../CMakeLists.txt | 0 .../test-launch-nested-tile-direct-unchecked.cpp.in} | 0 .../tests/test-launch-nested-Tile-DirectUnchecked.hpp} | 0 .../CMakeLists.txt | 0 ...test-launch-nested-tile-icount-tcount-direct-unchecked.cpp.in} | 0 .../test-launch-nested-Tile-iCount-tCount-DirectUnchecked.hpp} | 0 ...AJA_test-launch-direct-unchecked-teams-threads-1D-execpol.hpp} | 0 ...AJA_test-launch-direct-unchecked-teams-threads-3D-execpol.hpp} | 0 15 files changed, 0 insertions(+), 0 deletions(-) rename test/functional/kernel/{tile-icount-tcount-unchecked => tile-icount-tcount-direct-unchecked}/CMakeLists.txt (100%) rename test/functional/kernel/{tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in => tile-icount-tcount-direct-unchecked/test-kernel-tile-count-direct-unchecked.cpp.in} (100%) rename test/functional/kernel/{tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp => tile-icount-tcount-direct-unchecked/tests/test-kernel-tile-ForICount-direct-unchecked.hpp} (100%) rename test/functional/kernel/{tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp => tile-icount-tcount-direct-unchecked/tests/test-kernel-tile-TileTCount-direct-unchecked.hpp} (100%) rename test/functional/launch/{nested_unchecked => nested_direct_unchecked}/CMakeLists.txt (100%) rename test/functional/launch/{nested_unchecked => nested_direct_unchecked}/test-launch-nested.cpp.in (100%) rename test/functional/launch/{nested_unchecked/tests/test-launch-nested-Unchecked.hpp => nested_direct_unchecked/tests/test-launch-nested-DirectUnchecked.hpp} (100%) rename test/functional/launch/{nested_tile_unchecked => nested_tile_direct_unchecked}/CMakeLists.txt (100%) rename test/functional/launch/{nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in => 
nested_tile_direct_unchecked/test-launch-nested-tile-direct-unchecked.cpp.in} (100%) rename test/functional/launch/{nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp => nested_tile_direct_unchecked/tests/test-launch-nested-Tile-DirectUnchecked.hpp} (100%) rename test/functional/launch/{tile_icount_tcount_unchecked => tile_icount_tcount_direct_unchecked}/CMakeLists.txt (100%) rename test/functional/launch/{tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in => tile_icount_tcount_direct_unchecked/test-launch-nested-tile-icount-tcount-direct-unchecked.cpp.in} (100%) rename test/functional/launch/{tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp => tile_icount_tcount_direct_unchecked/tests/test-launch-nested-Tile-iCount-tCount-DirectUnchecked.hpp} (100%) rename test/include/{RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp => RAJA_test-launch-direct-unchecked-teams-threads-1D-execpol.hpp} (100%) rename test/include/{RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp => RAJA_test-launch-direct-unchecked-teams-threads-3D-execpol.hpp} (100%) diff --git a/test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt b/test/functional/kernel/tile-icount-tcount-direct-unchecked/CMakeLists.txt similarity index 100% rename from test/functional/kernel/tile-icount-tcount-unchecked/CMakeLists.txt rename to test/functional/kernel/tile-icount-tcount-direct-unchecked/CMakeLists.txt diff --git a/test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in b/test/functional/kernel/tile-icount-tcount-direct-unchecked/test-kernel-tile-count-direct-unchecked.cpp.in similarity index 100% rename from test/functional/kernel/tile-icount-tcount-unchecked/test-kernel-tile-count-unchecked.cpp.in rename to test/functional/kernel/tile-icount-tcount-direct-unchecked/test-kernel-tile-count-direct-unchecked.cpp.in diff --git 
a/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp b/test/functional/kernel/tile-icount-tcount-direct-unchecked/tests/test-kernel-tile-ForICount-direct-unchecked.hpp similarity index 100% rename from test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-ForICount-unchecked.hpp rename to test/functional/kernel/tile-icount-tcount-direct-unchecked/tests/test-kernel-tile-ForICount-direct-unchecked.hpp diff --git a/test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp b/test/functional/kernel/tile-icount-tcount-direct-unchecked/tests/test-kernel-tile-TileTCount-direct-unchecked.hpp similarity index 100% rename from test/functional/kernel/tile-icount-tcount-unchecked/tests/test-kernel-tile-TileTCount-unchecked.hpp rename to test/functional/kernel/tile-icount-tcount-direct-unchecked/tests/test-kernel-tile-TileTCount-direct-unchecked.hpp diff --git a/test/functional/launch/nested_unchecked/CMakeLists.txt b/test/functional/launch/nested_direct_unchecked/CMakeLists.txt similarity index 100% rename from test/functional/launch/nested_unchecked/CMakeLists.txt rename to test/functional/launch/nested_direct_unchecked/CMakeLists.txt diff --git a/test/functional/launch/nested_unchecked/test-launch-nested.cpp.in b/test/functional/launch/nested_direct_unchecked/test-launch-nested.cpp.in similarity index 100% rename from test/functional/launch/nested_unchecked/test-launch-nested.cpp.in rename to test/functional/launch/nested_direct_unchecked/test-launch-nested.cpp.in diff --git a/test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp b/test/functional/launch/nested_direct_unchecked/tests/test-launch-nested-DirectUnchecked.hpp similarity index 100% rename from test/functional/launch/nested_unchecked/tests/test-launch-nested-Unchecked.hpp rename to test/functional/launch/nested_direct_unchecked/tests/test-launch-nested-DirectUnchecked.hpp diff 
--git a/test/functional/launch/nested_tile_unchecked/CMakeLists.txt b/test/functional/launch/nested_tile_direct_unchecked/CMakeLists.txt similarity index 100% rename from test/functional/launch/nested_tile_unchecked/CMakeLists.txt rename to test/functional/launch/nested_tile_direct_unchecked/CMakeLists.txt diff --git a/test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in b/test/functional/launch/nested_tile_direct_unchecked/test-launch-nested-tile-direct-unchecked.cpp.in similarity index 100% rename from test/functional/launch/nested_tile_unchecked/test-launch-nested-tile-unchecked.cpp.in rename to test/functional/launch/nested_tile_direct_unchecked/test-launch-nested-tile-direct-unchecked.cpp.in diff --git a/test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp b/test/functional/launch/nested_tile_direct_unchecked/tests/test-launch-nested-Tile-DirectUnchecked.hpp similarity index 100% rename from test/functional/launch/nested_tile_unchecked/tests/test-launch-nested-Tile-Unchecked.hpp rename to test/functional/launch/nested_tile_direct_unchecked/tests/test-launch-nested-Tile-DirectUnchecked.hpp diff --git a/test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt b/test/functional/launch/tile_icount_tcount_direct_unchecked/CMakeLists.txt similarity index 100% rename from test/functional/launch/tile_icount_tcount_unchecked/CMakeLists.txt rename to test/functional/launch/tile_icount_tcount_direct_unchecked/CMakeLists.txt diff --git a/test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in b/test/functional/launch/tile_icount_tcount_direct_unchecked/test-launch-nested-tile-icount-tcount-direct-unchecked.cpp.in similarity index 100% rename from test/functional/launch/tile_icount_tcount_unchecked/test-launch-nested-tile-icount-tcount-unchecked.cpp.in rename to 
test/functional/launch/tile_icount_tcount_direct_unchecked/test-launch-nested-tile-icount-tcount-direct-unchecked.cpp.in diff --git a/test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp b/test/functional/launch/tile_icount_tcount_direct_unchecked/tests/test-launch-nested-Tile-iCount-tCount-DirectUnchecked.hpp similarity index 100% rename from test/functional/launch/tile_icount_tcount_unchecked/tests/test-launch-nested-Tile-iCount-tCount-Unchecked.hpp rename to test/functional/launch/tile_icount_tcount_direct_unchecked/tests/test-launch-nested-Tile-iCount-tCount-DirectUnchecked.hpp diff --git a/test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-direct-unchecked-teams-threads-1D-execpol.hpp similarity index 100% rename from test/include/RAJA_test-launch-unchecked-teams-threads-1D-execpol.hpp rename to test/include/RAJA_test-launch-direct-unchecked-teams-threads-1D-execpol.hpp diff --git a/test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-direct-unchecked-teams-threads-3D-execpol.hpp similarity index 100% rename from test/include/RAJA_test-launch-unchecked-teams-threads-3D-execpol.hpp rename to test/include/RAJA_test-launch-direct-unchecked-teams-threads-3D-execpol.hpp