Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gpu policies that do not check loop bounds #1778

Open
wants to merge 15 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 165 additions & 36 deletions docs/sphinx/user_guide/feature/policies.rst

Large diffs are not rendered by default.

87 changes: 63 additions & 24 deletions include/RAJA/pattern/launch/launch_core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -547,8 +547,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
{

LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
segment,
body);
segment, body);
}

template <typename POLICY_LIST,
Expand All @@ -561,8 +560,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
{

LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
segment,
body);
segment, body);
}

namespace expt
Expand All @@ -580,9 +578,22 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
{

LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
segment0,
segment1,
body);
segment0, segment1, body);
}

RAJA_SUPPRESS_HD_WARN
// Two-segment overload of loop_icount: forwards both segments and the body to
// the LoopICountExecute specialization selected by the loop policy extracted
// from POLICY_LIST. The "icount" variant presumably also passes local
// iteration counts to the body — exact semantics live in the
// LoopICountExecute specialization (TODO confirm against the executor).
template <typename POLICY_LIST,
typename CONTEXT,
typename SEGMENT,
typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
SEGMENT const &segment0,
SEGMENT const &segment1,
BODY const &body)
{

LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
segment0, segment1, body);
}

RAJA_SUPPRESS_HD_WARN
Expand Down Expand Up @@ -617,7 +628,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
{

LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
segment0, segment1, segment2, body);
segment0, segment1, segment2, body);
}

} //namespace expt
Expand All @@ -640,9 +651,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
{

TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
tile_size,
segment,
body);
tile_size, segment, body);
}

template <typename POLICY_LIST,
Expand All @@ -656,9 +665,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
BODY const &body)
{
TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
tile_size,
segment,
body);
tile_size, segment, body);
}

namespace expt
Expand All @@ -678,11 +685,44 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
{

TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
tile_size0,
tile_size1,
segment0,
segment1,
body);
tile_size0, tile_size1, segment0, segment1, body);
}

// Two-dimensional overload of tile_tcount: forwards two tile sizes, two
// segments, and the body to the TileTCountExecute specialization selected by
// the loop policy extracted from POLICY_LIST. The "tcount" variant presumably
// also passes tile indices to the body — semantics are defined by the
// TileTCountExecute specialization (TODO confirm against the executor).
template <typename POLICY_LIST,
typename CONTEXT,
typename TILE_T,
typename SEGMENT,
typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
TILE_T tile_size0,
TILE_T tile_size1,
SEGMENT const &segment0,
SEGMENT const &segment1,
BODY const &body)
{

TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
tile_size0, tile_size1, segment0, segment1, body);
}

// Three-dimensional overload of tile: forwards three tile sizes, three
// segments, and the body to the TileExecute specialization selected by the
// loop policy extracted from POLICY_LIST. All tile sizes share TILE_T and all
// segments share SEGMENT, mirroring the 1D/2D overloads in this header.
template <typename POLICY_LIST,
typename CONTEXT,
typename TILE_T,
typename SEGMENT,
typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
TILE_T tile_size0,
TILE_T tile_size1,
TILE_T tile_size2,
SEGMENT const &segment0,
SEGMENT const &segment1,
SEGMENT const &segment2,
BODY const &body)
{

TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
tile_size0, tile_size1, tile_size2,
segment0, segment1, segment2, body);
}

template <typename POLICY_LIST,
Expand All @@ -693,17 +733,16 @@ template <typename POLICY_LIST,
RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
TILE_T tile_size0,
TILE_T tile_size1,
TILE_T tile_size2,
SEGMENT const &segment0,
SEGMENT const &segment1,
SEGMENT const &segment2,
BODY const &body)
{

TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
tile_size0,
tile_size1,
segment0,
segment1,
body);
tile_size0, tile_size1, tile_size2,
segment0, segment1, segment2, body);
}

} //namespace expt
Expand Down
59 changes: 59 additions & 0 deletions include/RAJA/policy/cuda/kernel/For.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,65 @@ namespace RAJA
namespace internal
{

/*
 * Executor for work sharing inside CudaKernel.
 * Mapping without checking from IndexMapper to indices:
 * each invocation computes exactly one index from the hardware index
 * (IndexMapper::index) and assigns it with NO bounds check against the
 * segment length — callers must ensure the launch dimensions exactly
 * cover the segment (see calculateDimensions below).
 * Assigns the loop index to offset ArgumentId.
 * Meets all sync requirements.
 */
template <typename Data,
camp::idx_t ArgumentId,
typename IndexMapper,
kernel_sync_requirement sync,
typename... EnclosedStmts,
typename Types>
struct CudaStatementExecutor<
Data,
statement::For<ArgumentId,
RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
EnclosedStmts...>,
Types> {

using stmt_list_t = StatementList<EnclosedStmts...>;

// Set the argument type for this loop
using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;

using enclosed_stmts_t =
CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;

// Signed difference type of the segment bound to ArgumentId; used for
// both the computed index and the segment length.
using diff_t = segment_diff_type<ArgumentId, Data>;

// Maps the DirectUnchecked indexer onto concrete CUDA launch dimensions.
using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;

static inline RAJA_DEVICE
void exec(Data &data, bool thread_active)
{
// One index per invocation, taken directly from the hardware index.
// NOTE: intentionally no "i < len" guard — this is the unchecked policy.
const diff_t i = IndexMapper::template index<diff_t>();

// Assign the index to the argument
data.template assign_offset<ArgumentId>(i);

// execute enclosed statements
enclosed_stmts_t::exec(data, thread_active);
}

// Computes launch dimensions sized to the full segment length and merges
// them with the requirements of the enclosed statements.
static inline
LaunchDims calculateDimensions(Data const &data)
{
const diff_t len = segment_length<ArgumentId>(data);

CudaDims my_dims(0), my_min_dims(0);
DimensionCalculator::set_dimensions(my_dims, my_min_dims, len);
LaunchDims dims{my_dims, my_min_dims};

// combine with enclosed statements
LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
return dims.max(enclosed_dims);
}
};

/*
* Executor for work sharing inside CudaKernel.
* Mapping directly from IndexMapper to indices
Expand Down
52 changes: 52 additions & 0 deletions include/RAJA/policy/cuda/kernel/ForICount.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,58 @@ namespace RAJA
namespace internal
{

/*
 * Executor for work sharing inside CudaKernel.
 * Provides a direct unchecked mapping: one index per invocation, no bounds
 * check against the segment length.
 * Assigns the loop index to offset ArgumentId.
 * Assigns the loop index to param ParamId.
 * Meets all sync requirements.
 *
 * Inherits calculateDimensions (and type aliases) from the corresponding
 * statement::For executor; only exec is overridden to additionally record
 * the index into ParamId.
 */
template <typename Data,
camp::idx_t ArgumentId,
typename ParamId,
typename IndexMapper,
kernel_sync_requirement sync,
typename... EnclosedStmts,
typename Types>
struct CudaStatementExecutor<
Data,
statement::ForICount<ArgumentId, ParamId,
RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
EnclosedStmts...>,
Types>
: CudaStatementExecutor<
Data,
statement::For<ArgumentId,
RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
EnclosedStmts...>,
Types> {

using Base = CudaStatementExecutor<
Data,
statement::For<ArgumentId,
RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
EnclosedStmts...>,
Types>;

using typename Base::enclosed_stmts_t;
using typename Base::diff_t;

static inline RAJA_DEVICE
void exec(Data &data, bool thread_active)
{
// Direct unchecked mapping: exactly one index per invocation, taken
// straight from the hardware index (this is NOT a grid-stride loop).
const diff_t i = IndexMapper::template index<diff_t>();

// Assign the index to the argument and param
data.template assign_offset<ArgumentId>(i);
data.template assign_param<ParamId>(i);

// execute enclosed statements
enclosed_stmts_t::exec(data, thread_active);
}
};

/*
* Executor for work sharing inside CudaKernel.
* Provides a direct mapping.
Expand Down
84 changes: 84 additions & 0 deletions include/RAJA/policy/cuda/kernel/Tile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,90 @@ namespace RAJA
namespace internal
{

/*!
 * A specialized RAJA::kernel cuda_impl executor for statement::Tile with a
 * DirectUnchecked indexer: each invocation selects exactly one tile (no
 * bounds check on the tile index), slices the segment down to that tile,
 * runs the enclosed statements, and restores the segment.
 * Assigns the tile segment to segment ArgumentId.
 * Meets all sync requirements.
 */
template <typename Data,
camp::idx_t ArgumentId,
camp::idx_t chunk_size,
typename IndexMapper,
kernel_sync_requirement sync,
typename... EnclosedStmts,
typename Types>
struct CudaStatementExecutor<
Data,
statement::Tile<ArgumentId,
RAJA::tile_fixed<chunk_size>,
RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
EnclosedStmts...>,
Types>
{

using stmt_list_t = StatementList<EnclosedStmts...>;

using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;

// Signed difference type of the segment bound to ArgumentId.
using diff_t = segment_diff_type<ArgumentId, Data>;

// Maps the DirectUnchecked indexer onto concrete CUDA launch dimensions.
using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;

static inline RAJA_DEVICE
void exec(Data &data, bool thread_active)
{
// Get the segment referenced by this Tile statement
auto &segment = camp::get<ArgumentId>(data.segment_tuple);

using segment_t = camp::decay<decltype(segment)>;

// Compute this invocation's tile start offset directly from the hardware
// index — no check that it lies within the segment (unchecked policy).
const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);

// Keep copy of original segment, so we can restore it
segment_t orig_segment = segment;

// Assign our new tiled segment
segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));

// execute enclosed statements
enclosed_stmts_t::exec(data, thread_active);

// Set range back to original values
segment = orig_segment;
}

// Sizes the launch by the number of chunk_size tiles, then merges in the
// dimensions the enclosed statements need when run over a single tile.
static inline
LaunchDims calculateDimensions(Data const &data)
{
// Compute how many chunks
const diff_t full_len = segment_length<ArgumentId>(data);
const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));

CudaDims my_dims(0), my_min_dims(0);
DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
LaunchDims dims{my_dims, my_min_dims};

// privatize data, so we can mess with the segments
using data_t = camp::decay<Data>;
data_t private_data = data;

// Get original segment
auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);

// restrict to first tile
segment = segment.slice(0, static_cast<diff_t>(chunk_size));

// NOTE: We do not detect improper uses of direct_unchecked policies under tiling.
// This happens when using a direct unchecked policy on a tiled range that is not
// evenly divisible by chunk_size.
LaunchDims enclosed_dims =
enclosed_stmts_t::calculateDimensions(private_data);

return dims.max(enclosed_dims);
}
};

/*!
* A specialized RAJA::kernel cuda_impl executor for statement::Tile
* Assigns the tile segment to segment ArgumentId
Expand Down
Loading
Loading