From a774acdcc0b36b74d805652df7af24707d17fb8d Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Wed, 9 Jun 2021 23:55:40 +0300 Subject: [PATCH] Fixes #92, regards #91: Dropped the `grid_info` namespace. Also, preferring `lane::id()` over `lane::index()` when the latter was previously used. --- README.md | 14 ++-- src/kat/on_device/collaboration/block.cuh | 33 ++------ src/kat/on_device/collaboration/grid.cuh | 12 +-- src/kat/on_device/collaboration/warp.cuh | 18 ++--- src/kat/on_device/grid_info.cuh | 31 +++----- src/kat/on_device/ranges.cuh | 16 ++-- src/kat/on_device/sequence_ops/block.cuh | 28 +++---- src/kat/on_device/sequence_ops/grid.cuh | 1 - src/kat/on_device/sequence_ops/warp.cuh | 10 +-- src/kat/on_device/shared_memory/basic.cuh | 4 +- .../on_device/streams/prefix_generators.cuh | 6 +- .../on_device/streams/printfing_ostream.cuh | 6 +- tests/atomics.cu | 34 ++++---- tests/block_collaboration.cu | 25 +++--- tests/grid_collaboration.cu | 8 +- tests/printing.cu | 16 ++-- tests/ranges.cu | 2 +- tests/sequence_ops.cu | 78 +++++++++---------- tests/warp_collaboration.cu | 50 ++++++------ 19 files changed, 180 insertions(+), 212 deletions(-) diff --git a/README.md b/README.md index 909e03f..76a06c1 100644 --- a/README.md +++ b/README.md @@ -92,14 +92,14 @@ num_full_warps_in_grid_block() ``` the library groups these (and many other related) functions into relevant namespaces. We thus have: ``` -linear_grid::grid_info::thread::global_id() -grid_info::lane::id() -linear_grid::grid_info::thread::is_last_in_block() -linear_grid::grid_info::block::num_full_warps() +linear_grid::thread::global_id() +lane::id() +linear_grid::thread::is_last_in_block() +linear_grid::block::num_full_warps() ``` -which is easier to browse through if you use auto-complete. The order comes at the expense of brevity... but we can alleviate this with an appropriate `namespace`. The above can then become simply: +which is easier to browse through if you use auto-complete. The order comes at the expense of brevity... but we can alleviate this somewhat with a shorter namespace. The above can then become simply: ``` -namespace gi = kat::linear_grid::grid_info; +namespace gi = kat::linear_grid; gi::thread::global_id() gi::lane::id() gi::thread::is_last_in_block() @@ -107,7 +107,7 @@ gi::block::num_full_warps() ``` in your code. Now _this_ is how I want to write my kernels! -You will note, that most similar phrases you could come up with about positions and sizes within the grid - already have implementations. For example: "I can get the number of full warps, but now I want the number of warps, period"; well, just replace `num_full_warps()` with `num_warps()` and it's there: `linear_grid::grid_info::block::num_warps()` is available. +You will note, that most similar phrases you could come up with about positions and sizes within the grid - already have implementations. For example: "I can get the number of full warps, but now I want the number of warps, period"; well, just replace `num_full_warps()` with `num_warps()` and it's there: `linear_grid::block::num_warps()` is available. And as a final bonus - if you write a non-linear kernel, with blocks and grids having y and z dimensions other than 1 - you will only need to change your `namespace =` or `using` statements, to be able to write the same code and use 3-D implementations of these functions instead. diff --git a/src/kat/on_device/collaboration/block.cuh b/src/kat/on_device/collaboration/block.cuh index 92cd40e..4862587 100644 --- a/src/kat/on_device/collaboration/block.cuh +++ b/src/kat/on_device/collaboration/block.cuh @@ -32,18 +32,6 @@ namespace collaborative { namespace block { -///@cond -// If we want to refer to other primitives, we'll make those references explicit; -// but we do want to be able to say `warp::id()` without prefixing that with anything. - -namespace grid = grid_info::grid; -namespace block = grid_info::block; -namespace warp = grid_info::warp; -namespace thread = grid_info::thread; -namespace lane = grid_info::lane; - -///@endcond - /* * TODO: Implement * KAT_FD unsigned all_satisfy(unsigned int predicate, unsigned* scratch_area); @@ -71,8 +59,8 @@ KAT_FD void share_per_warp_data( T* __restrict__ where_to_make_available, unsigned writing_lane_id) { - if (lane::index() == writing_lane_id) { - where_to_make_available[warp::id()] = datum; + if (lane::id() == writing_lane_id) { + where_to_make_available[kat::warp::id()] = datum; } if (Synchronize) __syncthreads(); } @@ -144,18 +132,6 @@ namespace linear_grid { namespace collaborative { namespace block { -///@cond -// If we want to refer to other collaboration primitives, we'll make those references explicit; -// but we do want to be able to say `warp::id()` without prefixing that with anything. - -namespace grid = grid_info::grid; -namespace block = grid_info::block; -namespace warp = grid_info::warp; -namespace thread = grid_info::thread; -namespace lane = grid_info::lane; - -///@endcond - /* * TODO: Implement * KAT_FD unsigned all_satisfy(unsigned int predicate, unsigned* scratch_area); @@ -206,8 +182,9 @@ KAT_FD void share_per_warp_data( T* __restrict__ where_to_make_available, unsigned writing_lane_id) { - if (lane::index() == writing_lane_id) { - where_to_make_available[warp::id()] = datum; + namespace gi = kat::linear_grid; + if (gi::lane::id() == writing_lane_id) { + where_to_make_available[gi::warp::id()] = datum; } if (Synchronize) __syncthreads(); } diff --git a/src/kat/on_device/collaboration/grid.cuh b/src/kat/on_device/collaboration/grid.cuh index 6c79c45..2c096fb 100644 --- a/src/kat/on_device/collaboration/grid.cuh +++ b/src/kat/on_device/collaboration/grid.cuh @@ -32,11 +32,11 @@ namespace grid { // If we want to refer to other primitives, we'll make those references explicit; // but we do want to be able to say `warp::index()` without prefixing that with anything. -namespace grid = kat::linear_grid::grid_info::grid; -namespace block = kat::linear_grid::grid_info::block; -namespace warp = kat::linear_grid::grid_info::warp; -namespace thread = kat::linear_grid::grid_info::thread; -namespace lane = kat::linear_grid::grid_info::lane; +namespace grid = kat::linear_grid::grid; +namespace block = kat::linear_grid::block; +namespace warp = kat::linear_grid::warp; +namespace thread = kat::linear_grid::thread; +namespace lane = kat::linear_grid::lane; /** * Have all kernel threads perform some action over the linear range @@ -92,7 +92,7 @@ namespace warp_per_input_element { * * @note This version of `at_grid_stride` is specific to linear grids, * even though the text of its code looks the same as that of - * @ref kat::grid_info::collaborative::warp::at_grid_stride . + * @ref kat::collaborative::warp::at_grid_stride . * * @param length The length of the range of positions on which to act * @param f The callable for warps to use each position in the sequence diff --git a/src/kat/on_device/collaboration/warp.cuh b/src/kat/on_device/collaboration/warp.cuh index a48c66a..8d1e46b 100644 --- a/src/kat/on_device/collaboration/warp.cuh +++ b/src/kat/on_device/collaboration/warp.cuh @@ -78,8 +78,6 @@ KAT_FD int last_lane_in(lane_mask_t mask) namespace collaborative { namespace warp { -namespace lane = grid_info::lane; - /** * @brief Guarantees all memory writes by (mask-specified) warp lanes are visible to the other (mask-specified) lanes. * @@ -296,13 +294,13 @@ KAT_FD T get_from_lane(T value, int source_lane) template KAT_FD T get_from_first_lane(T value) { - return get_from_lane(value, grid_info::warp::first_lane); + return get_from_lane(value, kat::warp::first_lane); } template KAT_FD T get_from_last_lane(T value) { - return get_from_lane(value, grid_info::warp::last_lane); + return get_from_lane(value, kat::warp::last_lane); } /** @@ -422,13 +420,13 @@ KAT_FD typename std::result_of::type have_a_single_lane_compute(Func template KAT_FD typename std::result_of::type have_first_lane_compute(Function f) { - return have_a_single_lane_compute(f, grid_info::warp::first_lane); + return have_a_single_lane_compute(f, kat::warp::first_lane); } template KAT_FD typename std::result_of::type have_last_lane_compute(Function f) { - return have_a_single_lane_compute(f, grid_info::warp::last_lane); + return have_a_single_lane_compute(f, kat::warp::last_lane); } KAT_FD unsigned index_among_active_lanes() @@ -554,11 +552,11 @@ KAT_FD search_result_t multisearch(const T& lane_needle, const T& lane_hay_st unsigned lower, upper; // lower is inclusive, upper is exclusive } bounds; if (lane_needle <= lane_hay_straw) { - bounds.lower = grid_info::warp::first_lane; - bounds.upper = grid_info::lane::id(); + bounds.lower = kat::warp::first_lane; + bounds.upper = kat::lane::id(); } else { - bounds.lower = grid_info::lane::id() + 1; + bounds.lower = lane::id() + 1; bounds.upper = warp_size; } enum : unsigned { cutoff_to_linear_search = 6 }; @@ -680,7 +678,7 @@ template < auto full_warp_writes_output_length = (PossibilityOfSlack == detail::has_no_slack) ? full_warp_reads_output_length : round_down_to_full_warps(full_warp_reads_output_length); - const auto lane_index = grid_info::lane::id(); + const auto lane_index = lane::id(); promoted_size_t input_pos = lane_index; diff --git a/src/kat/on_device/grid_info.cuh b/src/kat/on_device/grid_info.cuh index a9466f3..cc794fe 100644 --- a/src/kat/on_device/grid_info.cuh +++ b/src/kat/on_device/grid_info.cuh @@ -175,8 +175,6 @@ constexpr KAT_FHD bool dimensionality_is_canonical(dimensions_t dims) // TODO: Perhaps have functions for strided copy in and out -namespace grid_info { - namespace detail { template @@ -267,7 +265,7 @@ KAT_FD grid_block_dimension_t size() { return size(dimensions()); } KAT_FD position_t first_thread_position() { return position_t{0, 0, 0}; } template -KAT_FD position_t last_thread_position() { return grid_info::detail::last_position_for(blockDim); } +KAT_FD position_t last_thread_position() { return detail::last_position_for(blockDim); } template KAT_FD grid_block_dimension_t @@ -306,7 +304,7 @@ namespace grid { */ KAT_FD bool is_linear() { - return gridDim.y == 1 and gridDim.z == 1 and grid_info::block::is_linear(); + return gridDim.y == 1 and gridDim.z == 1 and block::is_linear(); } // TODO: Consider templatizing this on the dimensions too @@ -428,11 +426,11 @@ KAT_FD position_t global_index() { return position_in_grid -KAT_FD unsigned id_in_block() { return grid_info::thread::id_in_block() / warp_size; } +KAT_FD unsigned id_in_block() { return thread::id_in_block() / warp_size; } template KAT_FD unsigned index_in_block() { return id_in_block(); } template -KAT_FD unsigned id_in_grid() { return grid_info::thread::id_in_grid() / warp_size; } +KAT_FD unsigned id_in_grid() { return thread::id_in_grid() / warp_size; } template KAT_FD unsigned index() { return index_in_block(); } template @@ -531,13 +529,10 @@ KAT_FD bool is_last_in_warp() { return lane::id() == warp: } // namespace thread -} // namespace grid_info // I couldn't use '1d, 2d, 3d since those aren't valid identifiers... namespace linear_grid { -namespace grid_info { - namespace grid { // TODO: Should we use the same return types as for the non-linear case? @@ -556,7 +551,7 @@ KAT_FD grid_dimension_t first_last_position() { return index_of_last_block( namespace block { -using kat::grid_info::block::dimensions; +using kat::block::dimensions; KAT_FD unsigned index_in_grid() { return blockIdx.x; } KAT_FD grid_block_dimension_t index() { return index_in_grid(); } KAT_FD unsigned id_in_grid() { return index_in_grid(); } @@ -605,10 +600,10 @@ KAT_FD unsigned num_warps_per_block() { return block::num_warps(); } namespace warp { -using kat::grid_info::warp::first_lane; -using kat::grid_info::warp::last_lane; -using kat::grid_info::warp::size; -using kat::grid_info::warp::length; +using kat::warp::first_lane; +using kat::warp::last_lane; +using kat::warp::size; +using kat::warp::length; } @@ -632,8 +627,8 @@ KAT_FD bool is_last_in_block() { return index_in_block() == KAT_FD bool is_first_in_grid() { return block::is_first_in_grid() and thread::is_first_in_block(); } KAT_FD bool is_last_in_grid() { return block::is_last_in_grid() and thread::is_last_in_block(); } -using ::kat::grid_info::thread::is_first_in_warp; -using ::kat::grid_info::thread::is_last_in_warp; +using ::kat::thread::is_first_in_warp; +using ::kat::thread::is_last_in_warp; /** @@ -714,7 +709,7 @@ namespace lane { // directly - we have to separate the code for the linear-grid and // non-linear-grid cases. -enum { half_warp_size = kat::grid_info::lane::half_warp_size }; +enum { half_warp_size = kat::lane::half_warp_size }; KAT_FD unsigned id_of(unsigned thread_index) { @@ -750,8 +745,6 @@ KAT_FD unsigned is_in_second_half_warp() { return id_in_warp() >= half_warp_siz } // namespace lane -} // namespace grid_info - } // namespace linear_grid } // namespace kat diff --git a/src/kat/on_device/ranges.cuh b/src/kat/on_device/ranges.cuh index 424d753..6265459 100644 --- a/src/kat/on_device/ranges.cuh +++ b/src/kat/on_device/ranges.cuh @@ -28,7 +28,7 @@ template KAT_DEV kat::ranges::strided warp_stride(Size length) { constexpr const auto stride = warp_size; - auto begin = grid_info::lane::id(); + auto begin = lane::id(); return ::kat::ranges::strided>(begin, length, stride); } @@ -58,7 +58,7 @@ template KAT_DEV kat::ranges::strided warp_stride(Size length) { constexpr const auto stride = warp_size; - auto begin = grid_info::lane::id(); + auto begin = lane::id(); return ::kat::ranges::strided>(begin, length, stride); } @@ -76,8 +76,8 @@ KAT_DEV kat::ranges::strided warp_stride(Size length) template KAT_DEV kat::ranges::strided block_stride(Size length) { - const auto stride = linear_grid::grid_info::block::size(); - const auto begin = grid_info::thread::id_in_block(); + const auto stride = linear_grid::block::size(); + const auto begin = thread::id_in_block(); return ::kat::ranges::strided>(begin, length, stride); } @@ -94,8 +94,8 @@ KAT_DEV kat::ranges::strided block_stride(Size length) template KAT_DEV kat::ranges::strided grid_stride(Size length) { - const auto stride = grid_info::grid::total_size(); - const auto begin = grid_info::thread::global_id(); + const auto stride = grid::total_size(); + const auto begin = thread::global_id(); return ::kat::ranges::strided>(begin, length, stride); } @@ -104,8 +104,8 @@ namespace warp_per_input_element { template KAT_DEV kat::ranges::strided grid_stride(Size length) { - const auto stride = grid_info::grid::num_warps(); - const auto begin = grid_info::warp::global_id(); + const auto stride = grid::num_warps(); + const auto begin = warp::global_id(); return ::kat::ranges::strided>(begin, length, stride); } diff --git a/src/kat/on_device/sequence_ops/block.cuh b/src/kat/on_device/sequence_ops/block.cuh index a06e498..a30547a 100644 --- a/src/kat/on_device/sequence_ops/block.cuh +++ b/src/kat/on_device/sequence_ops/block.cuh @@ -255,14 +255,14 @@ template< bool AllThreadsObtainResult = false> KAT_DEV T reduce(T value, AccumulationOp op) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; constexpr const T neutral_value {}; static __shared__ T warp_reductions[warp_size]; auto intra_warp_result = kat::collaborative::warp::reduce(value, op); - collaborative::block::share_per_warp_data(intra_warp_result, warp_reductions, gi::warp::first_lane); + collaborative::block::share_per_warp_data(intra_warp_result, warp_reductions, kat::warp::first_lane); // Note: assuming here that there are at most 32 warps per block; // if/when this changes, more warps may need to be involved in this second @@ -271,7 +271,7 @@ KAT_DEV T reduce(T value, AccumulationOp op) if (not AllThreadsObtainResult) { // We currently only guarantee the first thread has the final result, // which is what allows most threads to return already: - if (not gi::warp::is_first_in_block()) { return neutral_value; } + if (not kat::warp::is_first_in_block()) { return neutral_value; } } collaborative::block::barrier(); // Perhaps we can do with something weaker here? @@ -279,8 +279,8 @@ KAT_DEV T reduce(T value, AccumulationOp op) // shared memory now holds all intra-warp reduction results // read from shared memory only if that warp actually existed - auto other_warp_result = (gi::lane::id() < gi::block::num_warps()) ? - warp_reductions[gi::lane::id()] : neutral_value; + auto other_warp_result = (lane::id() < gi::block::num_warps()) ? + warp_reductions[lane::id()] : neutral_value; return kat::collaborative::warp::reduce(other_warp_result, op); // TODO: Would it perhaps be faster to have only one warp compute this, @@ -317,8 +317,8 @@ KAT_DEV T scan(T value, AccumulationOp op, T* __restrict__ scratch) T, AccumulationOp, inclusivity_t::Inclusive >(value, op); auto last_active_lane_id = - // (AssumeFullWarps or not grid_info::warp::is_last_in_block()) ? - warp::last_lane + // (AssumeFullWarps or not warp::is_last_in_block()) ? + kat::warp::last_lane // : collaborative::warp::last_active_lane_index() ; @@ -335,7 +335,7 @@ KAT_DEV T scan(T value, AccumulationOp op, T* __restrict__ scratch) // scratch buffer now holds all full-warp _reductions_; - if (warp::is_first_in_block()) { + if (kat::warp::is_first_in_block()) { // Note that for a block with less than warp_size warps, some of the lanes // here will read junk data from the scratch area; but that's not a problem, // since these values will not effect the scan results of previous lanes, @@ -349,7 +349,7 @@ KAT_DEV T scan(T value, AccumulationOp op, T* __restrict__ scratch) collaborative::block::barrier(); - auto r = scratch[warp::id()]; + auto r = scratch[kat::warp::id()]; T intra_warp_scan_result; if (Inclusivity == inclusivity_t::Inclusive) { intra_warp_scan_result = intra_warp_inclusive_scan_result; @@ -416,8 +416,8 @@ template < T, AccumulationOp, inclusivity_t::Inclusive>(value, op); auto last_active_lane_id = - // (AssumeFullWarps or not grid_info::warp::is_last_in_block()) ? - warp::last_lane + // (AssumeFullWarps or not warp::is_last_in_block()) ? + kat::warp::last_lane // : collaborative::warp::last_active_lane_index() ; @@ -433,12 +433,12 @@ template < // scratch[i] now contains the reduction result of the data of all threads in // the i'th warp of this block - auto num_warps = block::num_warps(); + auto num_warps = kat::block::num_warps(); auto partial_reduction_result = scratch[num_warps - 1]; // We're keeping this single-warp reduction result, since it will soon // be overwritten - if (warp::is_first_in_block()) { + if (kat::warp::is_first_in_block()) { // Note that for a block with less than warp_size warps, some of the lanes // here will read junk data from the scratch area; but that's not a problem, // since these values will not effect the scan results of previous lanes, @@ -460,7 +460,7 @@ template < // We had kept the last warp's reduction result, now we've taken // the other warps into account as well - auto partial_scan_result = scratch[warp::id()]; // only a partial result for now + auto partial_scan_result = scratch[kat::warp::id()]; // only a partial result for now // To finalize the computation, we now account for the requested scan inclusivity diff --git a/src/kat/on_device/sequence_ops/grid.cuh b/src/kat/on_device/sequence_ops/grid.cuh index 50b72f5..2bcd855 100644 --- a/src/kat/on_device/sequence_ops/grid.cuh +++ b/src/kat/on_device/sequence_ops/grid.cuh @@ -55,7 +55,6 @@ KAT_FD void append_to_global_memory( T* __restrict__ fragment_to_append, Size __restrict__ fragment_length) { - using namespace grid_info; auto lane_to_perform_atomic_op = collaborative::warp::select_leader_lane(); auto f = [&]() { return atomic::add(global_output_length, fragment_length); diff --git a/src/kat/on_device/sequence_ops/warp.cuh b/src/kat/on_device/sequence_ops/warp.cuh index 3c7c077..0c95f83 100644 --- a/src/kat/on_device/sequence_ops/warp.cuh +++ b/src/kat/on_device/sequence_ops/warp.cuh @@ -299,10 +299,10 @@ KAT_FD void copy_n( Size length, T* __restrict__ target) { - using namespace linear_grid::grid_info; + using namespace linear_grid; enum { elements_per_lane_in_full_warp_write = - collaborative::detail::elements_per_lane_in_full_warp_write::value + kat::collaborative::detail::elements_per_lane_in_full_warp_write::value }; if ((elements_per_lane_in_full_warp_write == 1) or @@ -323,7 +323,7 @@ KAT_FD void copy_n( // TODO: Should I pragma-unroll this by a fixed amount? Should // I not specify an unroll at all? #pragma unroll - for(promoted_size_t pos = lane::index() * elements_per_lane_in_full_warp_write; + for(promoted_size_t pos = kat::lane::index() * elements_per_lane_in_full_warp_write; pos < truncated_length; pos += warp_size * elements_per_lane_in_full_warp_write) { @@ -340,8 +340,8 @@ KAT_FD void copy_n( } else { auto num_slack_elements = length - truncated_length; - if (lane::index() < num_slack_elements) { - auto pos = truncated_length + lane::index(); + if (kat::lane::index() < num_slack_elements) { + auto pos = truncated_length + kat::lane::index(); target[pos] = source[pos]; } } diff --git a/src/kat/on_device/shared_memory/basic.cuh b/src/kat/on_device/shared_memory/basic.cuh index 2fc471b..a6f3c08 100644 --- a/src/kat/on_device/shared_memory/basic.cuh +++ b/src/kat/on_device/shared_memory/basic.cuh @@ -124,7 +124,7 @@ template KAT_FD T* contiguous(unsigned num_elements_per_warp, offset_t base_offset = 0) { return proxy() + base_offset + - num_elements_per_warp * linear_grid::grid_info::warp::index_in_block(); + num_elements_per_warp * linear_grid::warp::index_in_block(); } /** @@ -144,7 +144,7 @@ KAT_FD T* contiguous(unsigned num_elements_per_warp, offset_t base_offset = 0) template KAT_FD T* strided(offset_t base_offset = 0) { - return proxy() + base_offset + linear_grid::grid_info::warp::index_in_block(); + return proxy() + base_offset + linear_grid::warp::index_in_block(); } } // namespace warp_specific diff --git a/src/kat/on_device/streams/prefix_generators.cuh b/src/kat/on_device/streams/prefix_generators.cuh index dfb8590..b44d79a 100644 --- a/src/kat/on_device/streams/prefix_generators.cuh +++ b/src/kat/on_device/streams/prefix_generators.cuh @@ -37,7 +37,7 @@ KAT_DEV void self_identify(kat::stringstream& ss); template <> KAT_DEV void self_identify(kat::stringstream& ss) { - namespace gi = ::kat::linear_grid::grid_info; + namespace gi = ::kat::linear_grid; const auto global_thread_id_width = detail::num_digits_required_for(gi::grid::num_threads() - 1); const auto block_id_width = detail::num_digits_required_for(gi::grid::num_blocks() - 1); @@ -63,7 +63,7 @@ KAT_DEV void self_identify(kat::stringstr template <> KAT_DEV void self_identify(kat::stringstream& ss) { - namespace gi = ::kat::linear_grid::grid_info; + namespace gi = ::kat::linear_grid; auto global_warp_id_width = detail::num_digits_required_for(gi::grid::num_warps() - 1); auto warp_id_width = detail::num_digits_required_for(gi::grid::num_warps_per_block() - 1); @@ -85,7 +85,7 @@ KAT_DEV void self_identify(kat::stringstrea template <> KAT_DEV void self_identify(kat::stringstream& ss) { - namespace gi = ::kat::linear_grid::grid_info; + namespace gi = ::kat::linear_grid; const unsigned block_id_width = detail::num_digits_required_for(gi::grid::num_blocks() - 1); constexpr const auto fill_char = '0'; diff --git a/src/kat/on_device/streams/printfing_ostream.cuh b/src/kat/on_device/streams/printfing_ostream.cuh index 109c437..d7488b6 100644 --- a/src/kat/on_device/streams/printfing_ostream.cuh +++ b/src/kat/on_device/streams/printfing_ostream.cuh @@ -84,9 +84,9 @@ protected: // rather than assuming the first one is. switch(r) { case resolution::thread: return true; - case resolution::warp: return grid_info::thread::is_first_in_warp(); - case resolution::block: return grid_info::thread::is_first_in_block(); - case resolution::grid: return grid_info::thread::is_first_in_grid(); + case resolution::warp: return thread::is_first_in_warp(); + case resolution::block: return thread::is_first_in_block(); + case resolution::grid: return thread::is_first_in_grid(); default: return false; // but can't get here } } diff --git a/tests/atomics.cu b/tests/atomics.cu index 2cd0d01..f99aa9d 100644 --- a/tests/atomics.cu +++ b/tests/atomics.cu @@ -598,7 +598,7 @@ TEST_CASE_TEMPLATE("min - random values from host", T, INTEGER_TYPES, FLOAT_TYPE T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -648,7 +648,7 @@ TEST_CASE_TEMPLATE("max - random values from host", T, INTEGER_TYPES, FLOAT_TYPE T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -694,7 +694,7 @@ TEST_CASE_TEMPLATE("min - single outlier", T, INTEGER_TYPES, FLOAT_TYPES) { T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -741,7 +741,7 @@ TEST_CASE_TEMPLATE("max - single outlier", T, INTEGER_TYPES, FLOAT_TYPES) { T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -787,7 +787,7 @@ TEST_CASE_TEMPLATE("logical_and - single outlier", T, INTEGER_TYPES) { T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -831,7 +831,7 @@ TEST_CASE_TEMPLATE("logical_or - single outlier", T, INTEGER_TYPES) { T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -877,7 +877,7 @@ TEST_CASE_TEMPLATE("logical_xor - single outlier 0", T, INTEGER_TYPES) { T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -922,7 +922,7 @@ TEST_CASE_TEMPLATE("logical_xor - single outlier 1", T, INTEGER_TYPES) { T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -965,7 +965,7 @@ TEST_CASE_TEMPLATE("logical_not - single non-negator", T, INTEGER_TYPES) { size_t, T* __restrict aggregate) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; // TODO: Should I sleep here? Use a block::barrier()? if (not (outlier_pos == gi::thread::global_id()) ) { @@ -1006,7 +1006,7 @@ TEST_CASE_TEMPLATE("logical_not - single negater", T, INTEGER_TYPES) { size_t, T* __restrict aggregate) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; if (outlier_pos == gi::thread::global_id() ) { kat::atomic::logical_not(aggregate); @@ -1052,7 +1052,7 @@ TEST_CASE_TEMPLATE("logical_not - by random threads", T, INTEGER_TYPES) { T* __restrict target, const fake_bool* __restrict perform_op_indicators) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; bool perform_op = perform_op_indicators[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -1106,7 +1106,7 @@ TEST_CASE_TEMPLATE("bitwise_and - single outliers", T, INTEGER_TYPES) { T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -1156,7 +1156,7 @@ TEST_CASE_TEMPLATE("bitwise_or - single outliers", T, INTEGER_TYPES) { T* __restrict aggregate, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -1203,7 +1203,7 @@ TEST_CASE_TEMPLATE("bitwise_xor - random values from host", T, INTEGER_TYPES) { T* __restrict target, const T* __restrict input_data) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_element = input_data[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -1252,7 +1252,7 @@ TEST_CASE_TEMPLATE("bitwise_not - by random threads", T, INTEGER_TYPES) { T* __restrict target, const fake_bool* __restrict perform_op_indicators) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; bool perform_op = perform_op_indicators[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -1319,7 +1319,7 @@ TEST_CASE_TEMPLATE("set_bit - few outliers", T, long int) { // INTEGER_TYPES) { const bit_index_type* __restrict bit_indices ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto bit_index = bit_indices[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? @@ -1384,7 +1384,7 @@ TEST_CASE_TEMPLATE("unset_bit - few outliers", T, long int) { // INTEGER_TYPES) const bit_index_type* __restrict bit_indices ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto bit_index = bit_indices[gi::thread::global_index()]; // TODO: Should I sleep here? Use a block::barrier()? diff --git a/tests/block_collaboration.cu b/tests/block_collaboration.cu index d8906b6..6a0c01b 100644 --- a/tests/block_collaboration.cu +++ b/tests/block_collaboration.cu @@ -215,7 +215,7 @@ TEST_CASE("at_block_stride") checked_value_type* pos_attendent_thread_indices ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto offset_into_attendant_array = length_to_cover_per_block * gi::block::id(); auto f_inner = [&] (size_t pos) { pos_attendent_thread_indices[offset_into_attendant_array + pos] = gi::thread::index_in_grid(); @@ -264,20 +264,21 @@ TEST_CASE("share_per_warp_data - specific writer lane") datum_type* warp_data_for_all_blocks ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; datum_type thread_datum { make_warp_datum(gi::block::id(), gi::warp::id()) }; // same for all threads in warp! constexpr auto max_possible_num_warps_per_block = 32; // Note: Important assumption here... - __shared__ datum_type warp_data [max_possible_num_warps_per_block]; + __shared__ datum_type block_warps_data [max_possible_num_warps_per_block]; constexpr const auto writing_lane_index = 3u; // just for kicks - klcb::share_per_warp_data(thread_datum, warp_data, writing_lane_index); + klcb::share_per_warp_data(thread_datum, block_warps_data, writing_lane_index); // We've run the synchronized variant, so no need for extra sync if (gi::thread::is_first_in_block()) { + // Now we're populating what's going to be checked outside the kernel. auto warp_data_for_this_block = warp_data_for_all_blocks + gi::block::id() * num_warps_per_block; for(int i = 0; i < num_warps_per_block; i++) { - warp_data_for_this_block[i] = warp_data[i]; + warp_data_for_this_block[i] = block_warps_data[i]; } } }; @@ -321,7 +322,7 @@ TEST_CASE("share_per_warp_data - inspecific writer lane") datum_type* warp_data_for_all_blocks ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; datum_type thread_datum { make_warp_datum(gi::block::id(), gi::warp::id()) }; // same for all threads in warp! constexpr auto max_possible_num_warps_per_block = 32; // Note: Important assumption here... @@ -382,7 +383,7 @@ TEST_CASE("get_from_thread") datum_type* thread_obtained_values ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; datum_type thread_datum { make_thread_datum(gi::block::id(), gi::thread::id()) }; auto source_thread_index { make_source_thread_index(gi::block::id()) }; auto obtained_value { klcb::get_from_thread(thread_datum, source_thread_index) }; @@ -430,7 +431,7 @@ TEST_CASE("get_from_first_thread") datum_type* thread_obtained_values ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; datum_type thread_datum { make_thread_datum(gi::block::id(), gi::thread::id()) }; auto obtained_value { klcb::get_from_first_thread(thread_datum) }; // We've run the synchronized variant, so no need for extra sync @@ -488,7 +489,7 @@ TEST_CASE("share_per_warp_data - specific writer lane") datum_type* warp_data_for_all_blocks ) { - namespace gi = kat::grid_info; + namespace gi = kat; datum_type thread_datum { make_warp_datum(gi::block::id(), gi::warp::id()) }; constexpr auto max_possible_num_warps_per_block = 32; // Note: Important assumption here... __shared__ datum_type warp_data [max_possible_num_warps_per_block]; @@ -545,7 +546,7 @@ TEST_CASE("share_per_warp_data - inspecific writer lane") datum_type* warp_data_for_all_blocks ) { - namespace gi = kat::grid_info; + namespace gi = kat; datum_type thread_datum { make_warp_datum(gi::block::id(), gi::warp::id()) }; constexpr auto max_possible_num_warps_per_block = 32; // Note: Important assumption here... __shared__ datum_type warp_data [max_possible_num_warps_per_block]; @@ -607,7 +608,7 @@ TEST_CASE("get_from_thread") datum_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat; datum_type thread_datum { make_thread_datum(gi::block::id(), gi::thread::id()) }; auto source_thread_index { make_source_thread_index(gi::block::id()) }; auto obtained_value { kcb::get_from_thread(thread_datum, source_thread_index) }; @@ -659,7 +660,7 @@ TEST_CASE("get_from_first_thread") datum_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat; datum_type thread_datum { make_thread_datum(gi::block::id(), gi::thread::id()) }; auto obtained_value { kcb::get_from_first_thread(thread_datum) }; // We've run the synchronized variant, so no need for extra sync diff --git a/tests/grid_collaboration.cu b/tests/grid_collaboration.cu index d1c7729..31e3bdf 100644 --- a/tests/grid_collaboration.cu +++ b/tests/grid_collaboration.cu @@ -201,7 +201,7 @@ TEST_CASE("at_grid_stride") auto testcase_device_function = [] KAT_DEV (size_t length, checked_value_type* results) { auto f_inner = [&] (size_t pos) { - results[pos] = kat::linear_grid::grid_info::thread::id_in_grid(); + results[pos] = kat::linear_grid::thread::id_in_grid(); }; klcg::at_grid_stride(length, f_inner); }; @@ -243,10 +243,10 @@ TEST_CASE("at_block_stride") auto testcase_device_function = [] KAT_DEV (size_t length, checked_value_type* results) { auto f_inner = [&] (size_t pos) { // printf("Thread %u in block %u got pos %u of %u\n", threadIdx.x, blockIdx.x, (unsigned) pos, (unsigned) length); - results[pos] = kat::linear_grid::grid_info::thread::id_in_grid(); + results[pos] = kat::linear_grid::thread::id_in_grid(); }; auto serialization_factor = - length / kat::linear_grid::grid_info::grid::num_threads() + (length % kat::linear_grid::grid_info::grid::num_threads() != 0); + length / kat::linear_grid::grid::num_threads() + (length % kat::linear_grid::grid::num_threads() != 0); klcg::at_block_stride(length, f_inner, serialization_factor); }; @@ -320,7 +320,7 @@ TEST_CASE("warp_per_input_element::at_grid_stride") size_t length_of_attending_threads_info, checked_value_type* attending_threads_info) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; const auto my_index = gi::thread::id_in_grid(); auto grid_size_minus_my_index = gi::grid::num_threads() - my_index; auto f_inner = [&] (size_t pos) { diff --git a/tests/printing.cu b/tests/printing.cu index 599bcd4..ffe7bd2 100644 --- a/tests/printing.cu +++ b/tests/printing.cu @@ -105,14 +105,14 @@ __global__ void use_printfing_ostream() cout.flush(); kat::collaborative::block::barrier(); - if (kat::linear_grid::grid_info::thread::index_in_block() == 0) { + if (kat::linear_grid::thread::index_in_block() == 0) { printf("All threads in block %d have flushed cout.\n", blockIdx.x); } cout << "String literal 2 with newline - to be printed on use of flush manipulator\n"; cout << kat::flush; kat::collaborative::block::barrier(); - if (kat::linear_grid::grid_info::thread::index_in_block() == 0) { + if (kat::linear_grid::thread::index_in_block() == 0) { printf("All threads in block %d have streamed the flush manipulator to their cout.\n", blockIdx.x); } @@ -122,7 +122,7 @@ __global__ void use_printfing_ostream() __global__ void printfing_ostream_settings() { kat::printfing_ostream cout; - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; cout << "Before any setting\n"; cout.flush(); @@ -134,7 +134,7 @@ __global__ void printfing_ostream_settings() cout << "SHOULD NOT see \\n between threads' printouts of this sentence. "; cout.flush(); - if (kat::linear_grid::grid_info::thread::is_first_in_grid()) { + if (kat::linear_grid::thread::is_first_in_grid()) { cout << '\n'; cout.flush(); } @@ -158,7 +158,7 @@ __global__ void stream_manipulators_into_printfing_ostream() { kat::printfing_ostream cout; using kat::flush; - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; cout << "Before any setting\n" << flush; // TODO: What if the text is big enough to cause recycling? That shouldn't matter, but we should try it. @@ -167,7 +167,7 @@ __global__ void stream_manipulators_into_printfing_ostream() << kat::manipulators::no_newline_on_flush << "SHOULD NOT see \\n between threads' printouts of this sentence. " << flush; - if (kat::linear_grid::grid_info::thread::is_first_in_grid()) { + if (kat::linear_grid::thread::is_first_in_grid()) { // This will just add a newline after the long paragraph of many threads' non-newline-terminated strings. cout << kat::manipulators::endl; } @@ -202,7 +202,7 @@ __global__ void print_at_different_resolutions() { kat::printfing_ostream cout; using kat::flush; - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; cout << kat::manipulators::resolution(kat::printfing_ostream::resolution::grid); cout << "Printing at grid resolution. The printing thread is (" << blockIdx.x << "," << threadIdx.x << ")\n" << flush; @@ -230,7 +230,7 @@ __global__ void self_identifying_printfing_ostream() { kat::printfing_ostream cout; using kat::flush; - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; self_identifying_printfing_ostream_for_resolution(cout, kat::printfing_ostream::resolution::grid); diff --git a/tests/ranges.cu b/tests/ranges.cu index 40f6ded..8c7764b 100644 --- a/tests/ranges.cu +++ b/tests/ranges.cu @@ -348,7 +348,7 @@ TEST_CASE("irange coverage") for(auto pos : kat::irange(a, b)) { hash = silly_hash_next(hash, pos); } - values_to_populate[kat::linear_grid::grid_info::thread::global_id()] = hash; + values_to_populate[kat::linear_grid::thread::global_id()] = hash; }; auto expected_value_retriever = [=] (size_t pos) -> tc_type { diff --git a/tests/sequence_ops.cu b/tests/sequence_ops.cu index eb92a23..0fa521a 100644 --- a/tests/sequence_ops.cu +++ b/tests/sequence_ops.cu @@ -324,7 +324,7 @@ TEST_CASE("fill") size_t, checked_value_type* buffer_to_fill_by_entire_grid ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto start = buffer_to_fill_by_entire_grid + length_to_cover_per_block * gi::block::id(); auto end = start + length_to_cover_per_block; auto fill_value = resolve_fill_value(gi::block::id()); @@ -363,7 +363,7 @@ TEST_CASE("fill_n") { size_t, checked_value_type* buffer_to_fill_by_entire_grid ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto start = buffer_to_fill_by_entire_grid + length_to_cover_per_block * gi::block::id(); auto fill_value = resolve_fill_value(gi::block::id()); klcb::fill_n(start, length_to_cover_per_block, fill_value); @@ -396,7 +396,7 @@ TEST_CASE("memzero") { size_t, checked_value_type* buffer_to_fill_by_entire_grid ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto start = buffer_to_fill_by_entire_grid + length_to_cover_per_block * gi::block::id(); auto end = start + length_to_cover_per_block; klcb::memzero(start, end); @@ -428,7 +428,7 @@ TEST_CASE("memzero_n") { size_t, checked_value_type* buffer_to_fill_by_entire_grid ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto start = buffer_to_fill_by_entire_grid + length_to_cover_per_block * gi::block::id(); klcb::memzero_n(start, length_to_cover_per_block); }; @@ -469,7 +469,7 @@ TEST_CASE("transform") { const input_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto source_start = input + length_to_cover_per_block * gi::block::id(); auto source_end = source_start + length_to_cover_per_block; auto block_target_start = target + length_to_cover_per_block * gi::block::id(); @@ -513,7 +513,7 @@ TEST_CASE("transform_n") { const input_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto source_start = input + length_to_cover_per_block * gi::block::id(); auto block_target_start = target + length_to_cover_per_block * gi::block::id(); klcb::transform_n(source_start, length_to_cover_per_block, block_target_start, op); @@ -554,7 +554,7 @@ TEST_CASE("cast_and_copy") { const input_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto source_start = input + length_to_cover_per_block * gi::block::id(); auto source_end = source_start + length_to_cover_per_block; auto block_target_start = target + length_to_cover_per_block * gi::block::id(); @@ -596,7 +596,7 @@ TEST_CASE("cast_and_copy_n") { const input_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto start = input + length_to_cover_per_block * gi::block::id(); auto block_target_start = target + length_to_cover_per_block * gi::block::id(); klcb::cast_and_copy_n(start, length_to_cover_per_block, block_target_start); @@ -638,7 +638,7 @@ TEST_CASE("copy") { const input_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto source_start = input + length_to_cover_per_block * gi::block::id(); auto source_end = source_start + length_to_cover_per_block; auto block_target_start = target + length_to_cover_per_block * gi::block::id(); @@ -680,7 +680,7 @@ TEST_CASE("copy_n") { const input_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto start = input + length_to_cover_per_block * gi::block::id(); auto block_target_start = target + length_to_cover_per_block * gi::block::id(); klcb::copy_n(start, length_to_cover_per_block, block_target_start); @@ -726,7 +726,7 @@ TEST_CASE("lookup") { const index_type* __restrict indices ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto block_indices_start = indices + num_indices_per_block * gi::block::id(); auto block_target_start = target + num_indices_per_block * gi::block::id(); klcb::lookup(block_target_start, data, block_indices_start, num_indices_per_block); @@ -782,7 +782,7 @@ TEST_CASE_TEMPLATE("reduce - all threads obtain result", InputAndResultTypes, st // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; const auto plus = [](checked_value_type& x, checked_value_type y) { x += y; }; target[gi::thread::global_id()] = @@ -852,7 +852,7 @@ TEST_CASE_TEMPLATE("reduce - not all threads obtain result", InputAndResultTypes // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; auto plus = [](checked_value_type& x, checked_value_type y) { x += y; }; target[gi::thread::global_id()] = @@ -920,7 +920,7 @@ TEST_CASE_TEMPLATE("sum - all threads obtain result", InputAndResultTypes, std:: // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; target[gi::thread::global_id()] = klcb::sum(thread_input); @@ -986,7 +986,7 @@ TEST_CASE_TEMPLATE("inclusive scan with specified scratch area", InputAndResultT // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; const auto plus = [](checked_value_type& x, checked_value_type y) { x += y; }; static __shared__ checked_value_type scratch[kat::warp_size]; // assumes that there are no than warp_size warps per block @@ -1053,7 +1053,7 @@ TEST_CASE_TEMPLATE("inclusive scan without specified scratch area", InputAndResu // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; const auto plus = [](checked_value_type& x, checked_value_type y) { x += y; }; target[gi::thread::global_id()] = @@ -1119,7 +1119,7 @@ TEST_CASE_TEMPLATE("exclusive scan with specified scratch area", InputAndResultT // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; static __shared__ checked_value_type scratch[kat::warp_size]; // assumes that there are no than warp_size warps per block const auto plus = [](checked_value_type& x, checked_value_type y) { x += y; }; @@ -1185,7 +1185,7 @@ TEST_CASE_TEMPLATE("exclusive scan without specified scratch area", InputAndResu // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; const auto plus = [](checked_value_type& x, checked_value_type y) { x += y; }; target[gi::thread::global_id()] = @@ -1253,7 +1253,7 @@ TEST_CASE_TEMPLATE("inclusive scan_and_reduce with specified scratch area", Inpu // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; static __shared__ scan_result_type scratch[kat::warp_size]; // assumes that there are no than warp_size warps per block const auto plus = [](scan_result_type& x, scan_result_type y) { x += y; }; @@ -1329,7 +1329,7 @@ TEST_CASE_TEMPLATE("exclusive scan_and_reduce with specified scratch area", Inpu // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; static __shared__ scan_result_type scratch[kat::warp_size]; // assumes that there are no than warp_size warps per block const auto plus = [](scan_result_type& x, scan_result_type y) { x += y; }; @@ -1406,7 +1406,7 @@ TEST_CASE_TEMPLATE("inclusive scan_and_reduce with specified scratch area", Inpu // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; static __shared__ scan_result_type scratch[kat::warp_size]; // assumes that there are no than warp_size warps per block const auto plus = [](scan_result_type& x, scan_result_type y) { x += y; }; @@ -1482,7 +1482,7 @@ TEST_CASE_TEMPLATE("exclusive scan_and_reduce without specified scratch area", I // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; const auto plus = [](scan_result_type& x, scan_result_type y) { x += y; }; checked_value_type result; @@ -1548,7 +1548,7 @@ TEST_CASE("elementwise accumulate_n") const input_value_type* __restrict input_src ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto block_result = result + length_to_cover_per_block * gi::block::id(); auto block_dest = input_dest + length_to_cover_per_block * gi::block::id(); klcb::copy_n(block_dest, length_to_cover_per_block, block_result); @@ -1602,7 +1602,7 @@ TEST_CASE("elementwise accumulate") const input_value_type* __restrict input_src ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto block_result = result + length_to_cover_per_block * gi::block::id(); auto block_dest = input_dest + length_to_cover_per_block * gi::block::id(); klcb::copy_n(block_dest, length_to_cover_per_block, block_result); @@ -1669,7 +1669,7 @@ TEST_CASE("reduce") // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; // printf("Warp %u Lane %2u input is %4d\n", (unsigned) gi::warp::global_id(), (unsigned) gi::lane::id(), (int) thread_input); const auto plus = [](checked_value_type& x, checked_value_type y) { x += y; }; @@ -1739,7 +1739,7 @@ TEST_CASE("sum") // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; auto warp_sum = kcw::sum(thread_input); target[gi::thread::global_id()] = warp_sum; @@ -1806,7 +1806,7 @@ TEST_CASE("inclusive scan") // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; const auto plus = [](checked_value_type& x, checked_value_type y) { x += y; }; auto warp_scan_result = kcw::scan(thread_input, plus); @@ -1873,7 +1873,7 @@ TEST_CASE("exclusive scan") // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; const auto plus = [](checked_value_type& x, checked_value_type y) { x += y; }; auto warp_scan_result = kcw::scan(thread_input, plus); @@ -1940,7 +1940,7 @@ TEST_CASE("exclusive_prefix_sum") // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; auto warp_exclusive_prefix_sum = kcw::exclusive_prefix_sum(thread_input); target[gi::thread::global_id()] = warp_exclusive_prefix_sum; @@ -2005,7 +2005,7 @@ TEST_CASE("prefix_sum") { // Note: Every thread will set a target value, but there is still just one reduction result // per block. In this variant of reduce, all block threads must obtain the result. - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_input = input[gi::thread::global_id()]; auto warp_prefix_sum = kcw::prefix_sum(thread_input); target[gi::thread::global_id()] = warp_prefix_sum; @@ -2059,7 +2059,7 @@ TEST_CASE("cast_and_copy_n") const input_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto source_start = input + length_to_cover_per_warp * gi::warp::global_id(); auto warp_target_start = target + length_to_cover_per_warp * gi::warp::global_id(); kcw::cast_and_copy_n(source_start, length_to_cover_per_warp, warp_target_start); @@ -2102,7 +2102,7 @@ TEST_CASE("cast_and_copy") const input_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto source_start = input + length_to_cover_per_warp * gi::warp::global_id(); auto source_end = source_start + length_to_cover_per_warp; auto warp_target_start = target + length_to_cover_per_warp * gi::warp::global_id(); @@ -2145,7 +2145,7 @@ TEST_CASE("copy_n") const checked_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto source_start = input + length_to_cover_per_warp * gi::warp::global_id(); auto warp_target_start = target + length_to_cover_per_warp * gi::warp::global_id(); kcw::copy_n(source_start, length_to_cover_per_warp, warp_target_start); @@ -2187,7 +2187,7 @@ TEST_CASE("copy") const checked_value_type* __restrict input ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto source_start = input + length_to_cover_per_warp * gi::warp::global_id(); auto source_end = source_start + length_to_cover_per_warp; auto warp_target_start = target + length_to_cover_per_warp * gi::warp::global_id(); @@ -2228,7 +2228,7 @@ TEST_CASE("fill") size_t, checked_value_type* buffer_to_fill_by_entire_grid ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto start = buffer_to_fill_by_entire_grid + length_to_cover_per_warp * gi::warp::global_id(); auto end = start + length_to_cover_per_warp; auto fill_value = resolve_fill_value(gi::warp::global_id()); @@ -2268,7 +2268,7 @@ TEST_CASE("fill_n") { size_t, checked_value_type* buffer_to_fill_by_entire_grid ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto start = buffer_to_fill_by_entire_grid + length_to_cover_per_warp * gi::warp::global_id(); auto fill_value = resolve_fill_value(gi::warp::global_id()); kcw::fill_n(start, length_to_cover_per_warp, fill_value); @@ -2316,7 +2316,7 @@ TEST_CASE("lookup") const index_type* __restrict indices ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto warp_indices_start = indices + num_indices_per_warp * gi::warp::global_id(); auto warp_target_start = target + num_indices_per_warp * gi::warp::global_id(); kcw::lookup(warp_target_start, data, warp_indices_start, num_indices_per_warp); @@ -2367,7 +2367,7 @@ TEST_CASE("elementwise accumulate_n") const input_value_type* __restrict input_src ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto warp_result = result + length_to_cover_per_warp * gi::warp::global_id(); auto warp_dest = input_dest + length_to_cover_per_warp * gi::warp::global_id(); kcw::copy_n(warp_dest, length_to_cover_per_warp, warp_result); @@ -2422,7 +2422,7 @@ TEST_CASE("elementwise accumulate") const input_value_type* __restrict input_src ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto warp_result = result + length_to_cover_per_warp * gi::warp::global_id(); auto warp_dest = input_dest + length_to_cover_per_warp * gi::warp::global_id(); kcw::copy_n(warp_dest, length_to_cover_per_warp, warp_result); diff --git a/tests/warp_collaboration.cu b/tests/warp_collaboration.cu index 9cf8f78..75cfd1f 100644 --- a/tests/warp_collaboration.cu +++ b/tests/warp_collaboration.cu @@ -299,7 +299,7 @@ TEST_CASE("barrier") { ) { __shared__ int shared_array[warp_size]; - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::warp::id_in_block(), gi::lane::id()) }; shared_array[gi::lane::id()] = 0; @@ -357,7 +357,7 @@ TEST_CASE("all_lanes_satisfy") { predicate_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::warp::id_in_block(), gi::lane::id()) }; auto obtained_value { kcw::all_lanes_satisfy(thread_value) }; thread_obtained_values[gi::thread::global_id()] = obtained_value; @@ -411,7 +411,7 @@ TEST_CASE("no_lanes_satisfy") { predicate_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::warp::id_in_block(), gi::lane::id()) }; auto obtained_value { kcw::no_lanes_satisfy(thread_value) }; thread_obtained_values[gi::thread::global_id()] = obtained_value; @@ -464,7 +464,7 @@ TEST_CASE("all_lanes_agree_on") { predicate_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::warp::id_in_block(), gi::lane::id()) }; auto obtained_value { kcw::all_lanes_agree_on(thread_value) }; thread_obtained_values[gi::thread::global_id()] = obtained_value; @@ -522,7 +522,7 @@ TEST_CASE("some_lanes_satisfy") { predicate_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::warp::id_in_block(), gi::lane::id()) }; auto obtained_value { kcw::some_lanes_satisfy(thread_value) }; thread_obtained_values[gi::thread::global_id()] = obtained_value; @@ -575,7 +575,7 @@ TEST_CASE("num_lanes_agreeing_on") { predicate_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::warp::id_in_block(), gi::lane::id()) }; auto obtained_value { kcw::num_lanes_agreeing_on(thread_value) }; thread_obtained_values[gi::thread::global_id()] = obtained_value; @@ -629,7 +629,7 @@ TEST_CASE("majority_vote") { predicate_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::warp::id_in_block(), gi::lane::id()) }; auto obtained_value { kcw::majority_vote(thread_value) }; thread_obtained_values[gi::thread::global_id()] = obtained_value; @@ -690,7 +690,7 @@ TEST_CASE("in_unique_lane_with") { predicate_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::warp::id_in_block(), gi::lane::id()) }; auto obtained_value { kcw::in_unique_lane_with(thread_value) }; thread_obtained_values[gi::thread::global_id()] = obtained_value; @@ -744,7 +744,7 @@ TEST_CASE("get_from_lane") { auto testcase_device_function = [=] KAT_DEV (size_t, checked_value_type* thread_obtained_values) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::thread::id()) }; auto source_lane { make_source_lane(gi::thread::id()) }; auto obtained_value { kcw::get_from_lane(thread_value, source_lane) }; @@ -787,7 +787,7 @@ TEST_CASE("get_from_first_lane") { auto testcase_device_function = [=] KAT_DEV (size_t, checked_value_type* thread_obtained_values) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::thread::id()) }; auto obtained_value { kcw::get_from_first_lane(thread_value) }; thread_obtained_values[gi::thread::global_id()] = obtained_value; @@ -829,7 +829,7 @@ TEST_CASE("get_from_last_lane") { auto testcase_device_function = [=] KAT_DEV (size_t, checked_value_type* thread_obtained_values) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::thread::id()) }; auto obtained_value { kcw::get_from_last_lane(thread_value) }; thread_obtained_values[gi::thread::global_id()] = obtained_value; @@ -876,7 +876,7 @@ TEST_CASE("have_a_single_lane_compute") { auto testcase_device_function = [=] KAT_DEV (size_t, checked_value_type* thread_obtained_values) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto source_lane { make_source_lane(gi::thread::id()) }; auto obtained_value = kcw::have_a_single_lane_compute( @@ -922,7 +922,7 @@ TEST_CASE("have_first_lane_compute") { auto testcase_device_function = [=] KAT_DEV (size_t, checked_value_type* thread_obtained_values) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto obtained_value = kcw::have_first_lane_compute( [=]() { return make_thread_value(gi::thread::id()); } @@ -965,7 +965,7 @@ TEST_CASE("have_last_lane_compute") { auto testcase_device_function = [=] KAT_DEV (size_t, checked_value_type* thread_obtained_values) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto obtained_value = kcw::have_last_lane_compute( [=]() { return make_thread_value(gi::thread::id()); } @@ -1017,7 +1017,7 @@ TEST_CASE("first_lane_satisfying") { checked_value_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto thread_value { make_thread_value(gi::warp::id_in_block(), gi::lane::id()) }; auto obtained_value { kcw::first_lane_satisfying(thread_value) }; // if (threadIdx.x >= 64) printf("Thread %u value %u obtained %u\n", gi::thread::global_id(), thread_value, obtained_value); @@ -1075,7 +1075,7 @@ TEST_CASE("get_active_lanes") { checked_value_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; bool should_stay_active { determine_whether_to_stay_active(gi::warp::id_in_block(), gi::lane::id()) }; // if (threadIdx.x < 32) // printf("Thread %u %s stay active\n", gi::thread::id(), (should_stay_active ? "SHOULD" : "SHOULD NOT")); @@ -1145,7 +1145,7 @@ TEST_CASE("num_active_lanes") { checked_value_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; bool should_stay_active { determine_whether_to_stay_active(gi::warp::id_in_block(), gi::lane::id()) }; if (not should_stay_active) { thread_obtained_values[gi::thread::global_id()] = invalid_num_active_lanes; @@ -1210,7 +1210,7 @@ TEST_CASE("am_leader_lane") { checked_value_type* thread_obtained_values ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; bool should_stay_active { determine_whether_to_stay_active(gi::warp::id_in_block(), gi::lane::id()) }; if (not should_stay_active) { thread_obtained_values[gi::thread::global_id()] = false; @@ -1291,7 +1291,7 @@ TEST_CASE("index_among_active_lanes") { checked_value_type* thread_obtained_values ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; bool should_stay_active { determine_whether_to_stay_active(gi::warp::id_in_block(), gi::lane::id()) }; if (not should_stay_active) { thread_obtained_values[gi::thread::global_id()] = invalid_index; @@ -1347,7 +1347,7 @@ TEST_CASE("at_warp_stride") checked_value_type* pos_attendent_thread_indices ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto offset_into_attendant_array = length_to_cover_per_warp * gi::warp::id_in_block(); auto f_inner = [&] (size_t pos) { pos_attendent_thread_indices[offset_into_attendant_array + pos] = gi::thread::id_in_grid(); @@ -1407,7 +1407,7 @@ TEST_CASE("active_lanes_atomically_increment") checked_value_type* thread_values_before_increment ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; if (not determine_whether_to_stay_active(gi::warp::id_in_block(), gi::lane::id() )) { thread_values_before_increment[gi::thread::global_id()] = invalid_value_for_inactives; return; @@ -1472,7 +1472,7 @@ TEST_CASE("at_warp_stride") checked_value_type* pos_attendent_thread_indices ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto offset_into_attendant_array = length_to_cover_per_warp * gi::warp::id_in_block(); auto f_inner = [&] (size_t pos) { pos_attendent_thread_indices[offset_into_attendant_array + pos] = gi::thread::id_in_grid(); @@ -1533,7 +1533,7 @@ TEST_CASE("multisearch") { checked_value_type* search_results ) { - namespace gi = kat::grid_info; + namespace gi = kat::linear_grid; auto haystack_straw = make_thread_value(gi::warp::id_in_block(), gi::lane::id()); thread_value_type needle_to_search_for = make_search_value(gi::warp::id_in_block(), gi::lane::id()); auto search_result = klcw::multisearch(needle_to_search_for, haystack_straw); @@ -1612,7 +1612,7 @@ TEST_CASE_TEMPLATE ("compute_predicate_at_warp_stride", SlackSetting, checked_value_type* computed_predicate ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto instrumented_pred = [=] (size_t pos) -> bool { device_side_pos_attendants_raw[gi::warp::global_id() * length_to_cover_per_warp + pos] = gi::thread::id_in_grid(); return pred(pos); @@ -1695,7 +1695,7 @@ TEST_CASE("merge_sorted_half_warps - in-register") checked_value_type* merged_data ) { - namespace gi = kat::linear_grid::grid_info; + namespace gi = kat::linear_grid; auto half_warp_pair_index = gi::warp::global_id(); // Each warp gets a different pair of half-warps to merge auto first_half_warp_index = half_warp_pair_index / num_half_warps; auto second_half_warp_index = half_warp_pair_index % num_half_warps;