From ed637dd200e98268de017f67dd6cc0abc6b0a493 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 26 Aug 2024 09:44:07 -0700 Subject: [PATCH] Fill out 2d and 3d loop and tile implementations There were a number missing for cuda/hip --- include/RAJA/pattern/launch/launch_core.hpp | 103 +++-- include/RAJA/policy/cuda/launch.hpp | 446 ++++++++++++++++++++ include/RAJA/policy/hip/launch.hpp | 446 ++++++++++++++++++++ 3 files changed, 971 insertions(+), 24 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index f1d70aeacb..0457285b8e 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -547,8 +547,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx, { LoopExecute, SEGMENT>::exec(ctx, - segment, - body); + segment, body); } template , SEGMENT>::exec(ctx, - segment, - body); + segment, body); } namespace expt @@ -580,9 +578,38 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx, { LoopExecute, SEGMENT>::exec(ctx, - segment0, - segment1, - body); + segment0, segment1, body); +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) +{ + + LoopICountExecute, SEGMENT>::exec(ctx, + segment0, segment1, body); +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) +{ + + LoopExecute, SEGMENT>::exec(ctx, + segment0, segment1, segment2, body); } RAJA_SUPPRESS_HD_WARN @@ -617,7 +644,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, { LoopICountExecute, SEGMENT>::exec(ctx, - segment0, segment1, segment2, body); + segment0, segment1, segment2, body); } } //namespace expt @@ -640,9 +667,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, { TileExecute, SEGMENT>::exec(ctx, - tile_size, - segment, - body); + tile_size, segment, body); } template , SEGMENT>::exec(ctx, - tile_size, - segment, - body); + tile_size, segment, body); } namespace expt @@ -678,11 +701,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, { TileExecute, SEGMENT>::exec(ctx, - tile_size0, - tile_size1, - segment0, - segment1, - body); + tile_size0, tile_size1, segment0, segment1, body); } template , SEGMENT>::exec(ctx, - tile_size0, - tile_size1, - segment0, - segment1, - body); + tile_size0, tile_size1, segment0, segment1, body); +} + +template +RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) +{ + + TileExecute, SEGMENT>::exec(ctx, + tile_size0, tile_size1, tile_size2, + segment0, segment1, segment2, body); +} + +template +RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx, + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) +{ + + TileTCountExecute, SEGMENT>::exec(ctx, + tile_size0, tile_size1, tile_size2, + segment0, segment1, segment2, body); } } //namespace expt diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 2f49f68a96..fea2845e57 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -1134,6 +1134,63 @@ struct TileExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } +}; + +template +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2 = IndexMapper2::template index() * static_cast(tile_size2); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } +}; + template struct TileExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + + if (i0 < len0 && i1 < len1) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } + } +}; + +template +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2 = IndexMapper2::template index() * static_cast(tile_size2); + + if (i0 < len0 && i1 < len1 && i2 < len2) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } + } +}; + template struct TileExecute, kernel_sync_requirement::none, @@ -1183,6 +1308,87 @@ struct TileExecute +struct TileExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t i0_init = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1_init = IndexMapper1::template index() * static_cast(tile_size1); + + const diff_t i0_stride = IndexMapper0::template size() * static_cast(tile_size0); + const diff_t i1_stride = IndexMapper1::template size() * static_cast(tile_size1); + + for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) { + for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } + } + } +}; + +template +struct TileExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1, + IndexMapper2>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t i0_init = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1_init = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2_init = IndexMapper2::template index() * static_cast(tile_size2); + + const diff_t i0_stride = IndexMapper0::template size() * static_cast(tile_size0); + const diff_t i1_stride = IndexMapper1::template size() * static_cast(tile_size1); + const diff_t i2_stride = IndexMapper2::template size() * static_cast(tile_size2); + + for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) { + for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) { + for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } + } + } + } +}; + /* CUDA generic tile_tcount implementations @@ -1209,6 +1415,72 @@ struct TileTCountExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } +}; + +template +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + const diff_t t2 = IndexMapper2::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + const diff_t i2 = t2 * static_cast(tile_size2); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } +}; + template struct TileTCountExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + + if (i0 < len0 && i1 < len1) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } + } +}; + +template +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + const diff_t t2 = IndexMapper2::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + const diff_t i2 = t2 * static_cast(tile_size2); + + if (i0 < len0 && i1 < len1 && i2 < len2) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } + } +}; + template struct TileTCountExecute, kernel_sync_requirement::none, @@ -1261,5 +1610,102 @@ struct TileTCountExecute +struct TileTCountExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t t0_init = IndexMapper0::template index(); + const diff_t t1_init = IndexMapper1::template index(); + + const diff_t i0_init = t0_init * static_cast(tile_size0); + const diff_t i1_init = t1_init * static_cast(tile_size1); + + const diff_t t0_stride = IndexMapper0::template size(); + const diff_t t1_stride = IndexMapper1::template size(); + + const diff_t i0_stride = t0_stride * static_cast(tile_size0); + const diff_t i1_stride = t1_stride * static_cast(tile_size1); + + for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) { + for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } + } + } +}; + +template +struct TileTCountExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1, + IndexMapper2>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t t0_init = IndexMapper0::template index(); + const diff_t t1_init = IndexMapper1::template index(); + const diff_t t2_init = IndexMapper2::template index(); + + const diff_t i0_init = t0_init * static_cast(tile_size0); + const diff_t i1_init = t1_init * static_cast(tile_size1); + const diff_t i2_init = t2_init * static_cast(tile_size2); + + const diff_t t0_stride = IndexMapper0::template size(); + const diff_t t1_stride = IndexMapper1::template size(); + const diff_t t2_stride = IndexMapper2::template size(); + + const diff_t i0_stride = t0_stride * static_cast(tile_size0); + const diff_t i1_stride = t1_stride * static_cast(tile_size1); + const diff_t i2_stride = t2_stride * static_cast(tile_size2); + + for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) { + for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) { + for (diff_t i2 = i2_init, t2 = t2_init; i2 < len2; i2 += i2_stride, t2 += t2_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } + } + } + } +}; + } // namespace RAJA #endif diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index e793899d2d..18ab91526d 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -1134,6 +1134,63 @@ struct TileExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } +}; + +template +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2 = IndexMapper2::template index() * static_cast(tile_size2); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } +}; + template struct TileExecute +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + + if (i0 < len0 && i1 < len1) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } + } +}; + +template +struct TileExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t i0 = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1 = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2 = IndexMapper2::template index() * static_cast(tile_size2); + + if (i0 < len0 && i1 < len1 && i2 < len2) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } + } +}; + template struct TileExecute, kernel_sync_requirement::none, @@ -1183,6 +1308,87 @@ struct TileExecute +struct TileExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t i0_init = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1_init = IndexMapper1::template index() * static_cast(tile_size1); + + const diff_t i0_stride = IndexMapper0::template size() * static_cast(tile_size0); + const diff_t i1_stride = IndexMapper1::template size() * static_cast(tile_size1); + + for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) { + for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1))); + } + } + } +}; + +template +struct TileExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1, + IndexMapper2>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t i0_init = IndexMapper0::template index() * static_cast(tile_size0); + const diff_t i1_init = IndexMapper1::template index() * static_cast(tile_size1); + const diff_t i2_init = IndexMapper2::template index() * static_cast(tile_size2); + + const diff_t i0_stride = IndexMapper0::template size() * static_cast(tile_size0); + const diff_t i1_stride = IndexMapper1::template size() * static_cast(tile_size1); + const diff_t i2_stride = IndexMapper2::template size() * static_cast(tile_size2); + + for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) { + for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) { + for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2))); + } + } + } + } +}; + /* HIP generic tile_tcount implementations @@ -1209,6 +1415,72 @@ struct TileTCountExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } +}; + +template +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + const diff_t t2 = IndexMapper2::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + const diff_t i2 = t2 * static_cast(tile_size2); + + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } +}; + template struct TileTCountExecute +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + + if (i0 < len0 && i1 < len1) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } + } +}; + +template +struct TileTCountExecute, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t t0 = IndexMapper0::template index(); + const diff_t t1 = IndexMapper1::template index(); + const diff_t t2 = IndexMapper2::template index(); + + const diff_t i0 = t0 * static_cast(tile_size0); + const diff_t i1 = t1 * static_cast(tile_size1); + const diff_t i2 = t2 * static_cast(tile_size2); + + if (i0 < len0 && i1 < len1 && i2 < len2) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } + } +}; + template struct TileTCountExecute, kernel_sync_requirement::none, @@ -1261,5 +1610,102 @@ struct TileTCountExecute +struct TileTCountExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const &segment0, + SEGMENT const &segment1, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + + const diff_t t0_init = IndexMapper0::template index(); + const diff_t t1_init = IndexMapper1::template index(); + + const diff_t i0_init = t0_init * static_cast(tile_size0); + const diff_t i1_init = t1_init * static_cast(tile_size1); + + const diff_t t0_stride = IndexMapper0::template size(); + const diff_t t1_stride = IndexMapper1::template size(); + + const diff_t i0_stride = t0_stride * static_cast(tile_size0); + const diff_t i1_stride = t1_stride * static_cast(tile_size1); + + for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) { + for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + t0, t1); + } + } + } +}; + +template +struct TileTCountExecute, + kernel_sync_requirement::none, + IndexMapper0, + IndexMapper1, + IndexMapper2>, + SEGMENT> { + + using diff_t = typename std::iterator_traits::difference_type; + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const RAJA_UNUSED_ARG(&ctx), + TILE_T tile_size0, + TILE_T tile_size1, + TILE_T tile_size2, + SEGMENT const &segment0, + SEGMENT const &segment1, + SEGMENT const &segment2, + BODY const &body) + { + const diff_t len0 = segment0.end() - segment0.begin(); + const diff_t len1 = segment1.end() - segment1.begin(); + const diff_t len2 = segment2.end() - segment2.begin(); + + const diff_t t0_init = IndexMapper0::template index(); + const diff_t t1_init = IndexMapper1::template index(); + const diff_t t2_init = IndexMapper2::template index(); + + const diff_t i0_init = t0_init * static_cast(tile_size0); + const diff_t i1_init = t1_init * static_cast(tile_size1); + const diff_t i2_init = t2_init * static_cast(tile_size2); + + const diff_t t0_stride = IndexMapper0::template size(); + const diff_t t1_stride = IndexMapper1::template size(); + const diff_t t2_stride = IndexMapper2::template size(); + + const diff_t i0_stride = t0_stride * static_cast(tile_size0); + const diff_t i1_stride = t1_stride * static_cast(tile_size1); + const diff_t i2_stride = t2_stride * static_cast(tile_size2); + + for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) { + for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) { + for (diff_t i2 = i2_init, t2 = t2_init; i2 < len2; i2 += i2_stride, t2 += t2_stride) { + body(segment0.slice(i0, static_cast(tile_size0)), + segment1.slice(i1, static_cast(tile_size1)), + segment2.slice(i2, static_cast(tile_size2)), + t0, t1, t2); + } + } + } + } +}; + } // namespace RAJA #endif