From 787a122ece8adfb911cf055bac7dedb47a164a91 Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Fri, 10 Nov 2023 17:42:13 +0000 Subject: [PATCH 01/13] work-item strided transforms --- src/portfft/committed_descriptor_impl.hpp | 276 +++++++----------- src/portfft/common/logging.hpp | 6 +- src/portfft/descriptor_validate.hpp | 271 +++++++++++++++++ src/portfft/dispatcher/global_dispatcher.hpp | 7 +- .../dispatcher/subgroup_dispatcher.hpp | 4 +- .../dispatcher/workgroup_dispatcher.hpp | 9 +- .../dispatcher/workitem_dispatcher.hpp | 150 +++++++--- src/portfft/enums.hpp | 26 ++ src/portfft/specialization_constant.hpp | 12 +- src/portfft/utils.hpp | 64 ++++ test/common/reference_data_wrangler.hpp | 183 ++++++------ test/unit_test/fft_test_utils.hpp | 99 +++++-- test/unit_test/instantiate_fft_tests.hpp | 129 +++++++- 13 files changed, 889 insertions(+), 347 deletions(-) create mode 100644 src/portfft/descriptor_validate.hpp diff --git a/src/portfft/committed_descriptor_impl.hpp b/src/portfft/committed_descriptor_impl.hpp index 0a6470ea..1f472df8 100644 --- a/src/portfft/committed_descriptor_impl.hpp +++ b/src/portfft/committed_descriptor_impl.hpp @@ -32,6 +32,7 @@ #include "common/exceptions.hpp" #include "common/subgroup.hpp" #include "defines.hpp" +#include "descriptor_validate.hpp" #include "enums.hpp" #include "specialization_constant.hpp" #include "utils.hpp" @@ -76,69 +77,6 @@ class global_kernel; template class transpose_kernel; -/** - * Return the default strides for a given dft size - * - * @param lengths the dimensions of the dft - */ -inline std::vector get_default_strides(const std::vector& lengths) { - PORTFFT_LOG_FUNCTION_ENTRY(); - std::vector strides(lengths.size()); - std::size_t total_size = 1; - for (std::size_t i_plus1 = lengths.size(); i_plus1 > 0; i_plus1--) { - std::size_t i = i_plus1 - 1; - strides[i] = total_size; - total_size *= lengths[i]; - } - PORTFFT_LOG_TRACE("Default strides:", strides); - return strides; -} - -/** - * Return whether the given descriptor has default strides and distance for a given direction - * - * @tparam Descriptor Descriptor type - * @param desc Descriptor to check - * @param dir Direction - */ -template -bool has_default_strides_and_distance(const Descriptor& desc, direction dir) { - const auto default_strides = get_default_strides(desc.lengths); - const auto default_distance = desc.get_flattened_length(); - return desc.get_strides(dir) == default_strides && desc.get_distance(dir) == default_distance; -} - -/** - * Return whether the given descriptor has strides and distance consistent with the batch interleaved layout - * - * @tparam Descriptor Descriptor type - * @param desc Descriptor to check - * @param dir Direction - */ -template -bool is_batch_interleaved(const Descriptor& desc, direction dir) { - return desc.lengths.size() == 1 && desc.get_distance(dir) == 1 && - desc.get_strides(dir).back() == desc.number_of_transforms; -} - -/** - * Return an enum describing the layout of the data in the descriptor - * - * @tparam Descriptor Descriptor type - * @param desc Descriptor to check - * @param dir Direction - */ -template -detail::layout get_layout(const Descriptor& desc, direction dir) { - if (has_default_strides_and_distance(desc, dir)) { - return detail::layout::PACKED; - } - if (is_batch_interleaved(desc, dir)) { - return detail::layout::BATCH_INTERLEAVED; - } - return detail::layout::UNPACKED; -} - /** * A committed descriptor that contains everything that is needed to run FFT. * @@ -435,21 +373,22 @@ class committed_descriptor_impl { * @param num_factors total number of factors of the committed size, set as a spec constant */ void set_spec_constants(detail::level top_level, sycl::kernel_bundle& in_bundle, - std::size_t length, const std::vector& factors, - detail::elementwise_multiply multiply_on_load, detail::elementwise_multiply multiply_on_store, + Idx length, const std::vector& factors, detail::elementwise_multiply multiply_on_load, + detail::elementwise_multiply multiply_on_store, detail::apply_scale_factor scale_factor_applied, detail::level level, detail::complex_conjugate conjugate_on_load, detail::complex_conjugate conjugate_on_store, - Scalar scale_factor, Idx factor_num = 0, Idx num_factors = 0) { + Scalar scale_factor, IdxGlobal input_stride, IdxGlobal output_stride, + IdxGlobal input_distance, IdxGlobal output_distance, Idx factor_num = 0, + Idx num_factors = 0) { PORTFFT_LOG_FUNCTION_ENTRY(); - const Idx length_idx = static_cast(length); // These spec constants are used in all implementations, so we set them here PORTFFT_LOG_TRACE("Setting specialization constants:"); PORTFFT_LOG_TRACE("SpecConstComplexStorage:", params.complex_storage); in_bundle.template set_specialization_constant(params.complex_storage); - PORTFFT_LOG_TRACE("SpecConstNumRealsPerFFT:", 2 * length_idx); - in_bundle.template set_specialization_constant(2 * length_idx); - PORTFFT_LOG_TRACE("SpecConstWIScratchSize:", 2 * detail::wi_temps(length_idx)); - in_bundle.template set_specialization_constant(2 * detail::wi_temps(length_idx)); + PORTFFT_LOG_TRACE("SpecConstNumRealsPerFFT:", 2 * length); + in_bundle.template set_specialization_constant(2 * length); + PORTFFT_LOG_TRACE("SpecConstWIScratchSize:", 2 * detail::wi_temps(length)); + in_bundle.template set_specialization_constant(2 * detail::wi_temps(length)); PORTFFT_LOG_TRACE("SpecConstMultiplyOnLoad:", multiply_on_load); in_bundle.template set_specialization_constant(multiply_on_load); PORTFFT_LOG_TRACE("SpecConstMultiplyOnStore:", multiply_on_store); @@ -462,7 +401,14 @@ class committed_descriptor_impl { in_bundle.template set_specialization_constant(conjugate_on_store); PORTFFT_LOG_TRACE("get_spec_constant_scale:", scale_factor); in_bundle.template set_specialization_constant()>(scale_factor); - + PORTFFT_LOG_TRACE("SpecConstInputStride:", input_stride); + in_bundle.template set_specialization_constant(input_stride); + PORTFFT_LOG_TRACE("SpecConstOutputStride:", output_stride); + in_bundle.template set_specialization_constant(output_stride); + PORTFFT_LOG_TRACE("SpecConstInputDistance:", input_distance); + in_bundle.template set_specialization_constant(input_distance); + PORTFFT_LOG_TRACE("SpecConstOutputDistance:", output_distance); + in_bundle.template set_specialization_constant(output_distance); dispatch(top_level, in_bundle, length, factors, level, factor_num, num_factors); } @@ -537,53 +483,74 @@ class committed_descriptor_impl { std::optional> set_spec_constants_driver(detail::level top_level, kernel_ids_and_metadata_t& prepared_vec, direction compute_direction, - std::size_t dimension_num, - bool skip_scaling) { + std::size_t dimension_num) { Scalar scale_factor = compute_direction == direction::FORWARD ? params.forward_scale : params.backward_scale; - detail::apply_scale_factor scale_factor_applied = detail::apply_scale_factor::APPLIED; - bool is_compatible = true; - if (skip_scaling) { - scale_factor_applied = detail::apply_scale_factor::NOT_APPLIED; - } std::size_t counter = 0; - auto conjugate_on_load = detail::complex_conjugate::NOT_APPLIED; - auto conjugate_on_store = detail::complex_conjugate::NOT_APPLIED; + IdxGlobal remaining_factors_prod = static_cast(params.get_flattened_length()); std::vector result; - for (auto& [level, ids, factors] : prepared_vec) { - auto in_bundle = sycl::get_kernel_bundle(queue.get_context(), ids); - if (top_level == detail::level::GLOBAL) { - std::size_t factor_size = - static_cast(std::accumulate(factors.begin(), factors.end(), Idx(1), std::multiplies())); - if (counter == prepared_vec.size() - 1) { - if (compute_direction == direction::BACKWARD) { - conjugate_on_store = detail::complex_conjugate::APPLIED; + for (auto [level, ids, factors] : prepared_vec) { + const bool is_multi_dim = params.lengths.size() > 1; + const bool is_global = top_level == detail::level::GLOBAL; + const bool is_final_factor = counter == (prepared_vec.size() - 1); + const bool is_final_dim = dimension_num == (params.lengths.size() - 1); + const bool is_backward = compute_direction == direction::BACKWARD; + if (is_multi_dim && is_global) { + throw unsupported_configuration("multidimensional global transforms are not supported."); + } + + const auto multiply_on_store = is_global && !is_final_factor ? detail::elementwise_multiply::APPLIED + : detail::elementwise_multiply::NOT_APPLIED; + const auto conjugate_on_load = + is_backward && counter == 0 ? detail::complex_conjugate::APPLIED : detail::complex_conjugate::NOT_APPLIED; + const auto conjugate_on_store = + is_backward && is_final_factor ? detail::complex_conjugate::APPLIED : detail::complex_conjugate::NOT_APPLIED; + const auto apply_scale = is_final_factor && is_final_dim ? detail::apply_scale_factor::APPLIED + : detail::apply_scale_factor::NOT_APPLIED; + + Idx length{}; + IdxGlobal forward_stride{}; + IdxGlobal backward_stride{}; + IdxGlobal forward_distance{}; + IdxGlobal backward_distance{}; + + if (is_global) { + length = std::accumulate(factors.begin(), factors.end(), Idx(1), std::multiplies()); + + remaining_factors_prod /= length; + forward_stride = remaining_factors_prod; + backward_stride = remaining_factors_prod; + forward_distance = is_final_factor ? length : 1; + backward_distance = is_final_factor ? length : 1; + + } else { + length = static_cast(params.lengths[dimension_num]); + forward_stride = static_cast(params.forward_strides[dimension_num]); + backward_stride = static_cast(params.backward_strides[dimension_num]); + if (is_multi_dim) { + if (is_final_dim) { + forward_distance = length; + backward_distance = length; + } else { + forward_distance = 1; + backward_distance = 1; } - set_spec_constants(detail::level::GLOBAL, in_bundle, factor_size, factors, - detail::elementwise_multiply::NOT_APPLIED, detail::elementwise_multiply::NOT_APPLIED, - detail::apply_scale_factor::APPLIED, level, conjugate_on_load, conjugate_on_store, - scale_factor, static_cast(counter), static_cast(prepared_vec.size())); - // reset conjugate_on_store - conjugate_on_store = detail::complex_conjugate::NOT_APPLIED; } else { - if (counter == 0 && compute_direction == direction::BACKWARD) { - conjugate_on_load = detail::complex_conjugate::APPLIED; - } - set_spec_constants(detail::level::GLOBAL, in_bundle, factor_size, factors, - detail::elementwise_multiply::NOT_APPLIED, detail::elementwise_multiply::APPLIED, - detail::apply_scale_factor::NOT_APPLIED, level, conjugate_on_load, conjugate_on_store, - scale_factor, static_cast(counter), static_cast(prepared_vec.size())); - // reset conjugate_on_load - conjugate_on_load = detail::complex_conjugate::NOT_APPLIED; - } - } else { - if (compute_direction == direction::BACKWARD) { - conjugate_on_load = detail::complex_conjugate::APPLIED; - conjugate_on_store = detail::complex_conjugate::APPLIED; + forward_distance = static_cast(params.forward_distance); + backward_distance = static_cast(params.backward_distance); } - set_spec_constants(level, in_bundle, params.lengths[dimension_num], factors, - detail::elementwise_multiply::NOT_APPLIED, detail::elementwise_multiply::NOT_APPLIED, - scale_factor_applied, level, conjugate_on_load, conjugate_on_store, scale_factor); } + + const IdxGlobal input_stride = compute_direction == direction::FORWARD ? forward_stride : backward_stride; + const IdxGlobal output_stride = compute_direction == direction::FORWARD ? backward_stride : forward_stride; + const IdxGlobal input_distance = compute_direction == direction::FORWARD ? forward_distance : backward_distance; + const IdxGlobal output_distance = compute_direction == direction::FORWARD ? backward_distance : forward_distance; + + auto in_bundle = sycl::get_kernel_bundle(queue.get_context(), ids); + + set_spec_constants(top_level, in_bundle, length, factors, detail::elementwise_multiply::NOT_APPLIED, + multiply_on_store, apply_scale, level, conjugate_on_load, conjugate_on_store, scale_factor, + input_stride, output_stride, input_distance, output_distance, static_cast(counter), + static_cast(prepared_vec.size())); try { PORTFFT_LOG_TRACE("Building kernel bundle with subgroup size", SubgroupSize); result.emplace_back(sycl::build(in_bundle), factors, params.lengths[dimension_num], SubgroupSize, @@ -591,15 +558,11 @@ class committed_descriptor_impl { PORTFFT_LOG_TRACE("Kernel bundle build complete."); } catch (std::exception& e) { PORTFFT_LOG_WARNING("Build for subgroup size", SubgroupSize, "failed with message:\n", e.what()); - is_compatible = false; - break; + return std::nullopt; } counter++; } - if (is_compatible) { - return result; - } - return std::nullopt; + return result; } /** @@ -613,7 +576,7 @@ class committed_descriptor_impl { * @return `dimension_struct` for the newly built kernels */ template - dimension_struct build_w_spec_const(std::size_t dimension_num, bool skip_scaling) { + dimension_struct build_w_spec_const(std::size_t dimension_num) { PORTFFT_LOG_FUNCTION_ENTRY(); if (std::count(supported_sg_sizes.begin(), supported_sg_sizes.end(), SubgroupSize)) { auto [top_level, prepared_vec] = prepare_implementation(dimension_num); @@ -626,10 +589,10 @@ class committed_descriptor_impl { } if (is_compatible) { - auto forward_kernels = set_spec_constants_driver(top_level, prepared_vec, direction::FORWARD, - dimension_num, skip_scaling); - auto backward_kernels = set_spec_constants_driver(top_level, prepared_vec, direction::BACKWARD, - dimension_num, skip_scaling); + auto forward_kernels = + set_spec_constants_driver(top_level, prepared_vec, direction::FORWARD, dimension_num); + auto backward_kernels = + set_spec_constants_driver(top_level, prepared_vec, direction::BACKWARD, dimension_num); if (forward_kernels.has_value() && backward_kernels.has_value()) { return {forward_kernels.value(), backward_kernels.value(), top_level, params.lengths[dimension_num], SubgroupSize}; @@ -637,9 +600,9 @@ class committed_descriptor_impl { } } if constexpr (sizeof...(OtherSGSizes) == 0) { - throw invalid_configuration("None of the compiled subgroup sizes are supported by the device"); + throw unsupported_configuration("None of the compiled subgroup sizes are supported by the device"); } else { - return build_w_spec_const(dimension_num, skip_scaling); + return build_w_spec_const(dimension_num); } } @@ -801,32 +764,12 @@ class committed_descriptor_impl { PORTFFT_LOG_TRACE("local_memory_size:", local_memory_size); PORTFFT_LOG_TRACE("llc_size:", llc_size); - // check it's suitable to run - const auto forward_layout = detail::get_layout(params, direction::FORWARD); - const auto backward_layout = detail::get_layout(params, direction::BACKWARD); - if (params.lengths.size() > 1) { - const bool supported_layout = - forward_layout == detail::layout::PACKED && backward_layout == detail::layout::PACKED; - if (!supported_layout) { - throw unsupported_configuration("Multi-dimensional transforms are only supported with default data layout"); - } - } else { - const bool supported_layout = - (forward_layout == detail::layout::PACKED || forward_layout == detail::layout::BATCH_INTERLEAVED) && - (backward_layout == detail::layout::PACKED || backward_layout == detail::layout::BATCH_INTERLEAVED); - if (!supported_layout) { - throw unsupported_configuration("Arbitary strides are not supported"); - } - } + detail::validate::validate_descriptor(params); // compile the kernels and precalculate twiddles std::size_t n_kernels = params.lengths.size(); for (std::size_t i = 0; i < n_kernels; i++) { - bool skip_scaling = true; - if (i == n_kernels - 1) { - skip_scaling = false; - } - dimensions.emplace_back(build_w_spec_const(i, skip_scaling)); + dimensions.emplace_back(build_w_spec_const(i)); dimensions.back().forward_kernels.at(0).twiddles_forward = std::shared_ptr( calculate_twiddles(dimensions.back().level, dimensions.at(i), dimensions.back().forward_kernels), [queue](Scalar* ptr) { @@ -845,25 +788,17 @@ class committed_descriptor_impl { }); } - bool is_scratch_required = false; - Idx num_global_level_dimensions = 0; - for (std::size_t i = 0; i < n_kernels; i++) { - if (dimensions.at(i).level == detail::level::GLOBAL) { - is_scratch_required = true; - num_global_level_dimensions++; - } - } + Idx num_global_level_dimensions = static_cast(std::count_if( + dimensions.cbegin(), dimensions.cend(), [](auto& d) { return d.level == detail::level::GLOBAL; })); if (num_global_level_dimensions != 0) { if (params.lengths.size() > 1) { - throw unsupported_configuration("Only 1D FFTs that do not fit in local memory are supported"); + throw unsupported_configuration("For FFTs that do not fit in local memory only 1D is supported"); } if (params.get_distance(direction::FORWARD) != params.lengths[0] || params.get_distance(direction::BACKWARD) != params.lengths[0]) { throw unsupported_configuration("Large FFTs are currently only supported in non-strided format"); } - } - if (is_scratch_required) { allocate_scratch_and_precompute_scan(num_global_level_dimensions); } } @@ -1031,20 +966,12 @@ class committed_descriptor_impl { std::size_t outer_size = total_size / params.lengths.back(); std::size_t input_stride_0 = input_strides.back(); std::size_t output_stride_0 = output_strides.back(); - // distances are currently used just in the first dimension - these changes are meant for that one - // TODO fix this to support non-default layouts - if (input_stride_0 < input_distance) { // for example: batch interleaved input - input_distance = params.lengths.back(); - } - if (output_stride_0 < output_distance) { // for example: batch interleaved output - output_distance = params.lengths.back(); - } PORTFFT_LOG_TRACE("Dispatching the kernel for the last dimension"); sycl::event previous_event = dispatch_kernel_1d(in, out, in_imag, out_imag, dependencies, params.number_of_transforms * outer_size, - input_stride_0, output_stride_0, input_distance, output_distance, input_offset, - output_offset, dimensions.back(), compute_direction); + input_stride_0, output_stride_0, input_distance / outer_size, output_distance / outer_size, + input_offset, output_offset, dimensions.back(), compute_direction); if (n_dimensions == 1) { return previous_event; } @@ -1144,8 +1071,6 @@ class committed_descriptor_impl { direction compute_direction) { PORTFFT_LOG_FUNCTION_ENTRY(); if (SubgroupSize == dimension_data.used_sg_size) { - const bool input_packed = input_distance == dimension_data.length && input_stride == 1; - const bool output_packed = output_distance == dimension_data.length && output_stride == 1; const bool input_batch_interleaved = input_distance == 1 && input_stride == n_transforms; const bool output_batch_interleaved = output_distance == 1 && output_stride == n_transforms; for (kernel_data_struct kernel_data : dimension_data.forward_kernels) { @@ -1164,17 +1089,20 @@ class committed_descriptor_impl { } } } - if (input_packed && output_packed) { + + // UNPACKED layout is also being dispatched as PACKED layout + const bool is_in_place = in == out; + if (!input_batch_interleaved && !output_batch_interleaved) { return run_kernel( in, out, in_imag, out_imag, dependencies, n_transforms, input_offset, output_offset, dimension_data, compute_direction); } - if (input_batch_interleaved && output_packed && in != out) { + if (input_batch_interleaved && !output_batch_interleaved && !is_in_place) { return run_kernel( in, out, in_imag, out_imag, dependencies, n_transforms, input_offset, output_offset, dimension_data, compute_direction); } - if (input_packed && output_batch_interleaved && in != out) { + if (!input_batch_interleaved && output_batch_interleaved && !is_in_place) { return run_kernel( in, out, in_imag, out_imag, dependencies, n_transforms, input_offset, output_offset, dimension_data, compute_direction); @@ -1184,7 +1112,7 @@ class committed_descriptor_impl { in, out, in_imag, out_imag, dependencies, n_transforms, input_offset, output_offset, dimension_data, compute_direction); } - throw unsupported_configuration("Only PACKED or BATCH_INTERLEAVED transforms are supported"); + throw internal_error("None of the run_kernel functions match the description."); } if constexpr (sizeof...(OtherSGSizes) == 0) { throw invalid_configuration("None of the compiled subgroup sizes are supported by the device!"); diff --git a/src/portfft/common/logging.hpp b/src/portfft/common/logging.hpp index 6a12ab8d..1e9b80c3 100644 --- a/src/portfft/common/logging.hpp +++ b/src/portfft/common/logging.hpp @@ -52,10 +52,10 @@ struct logging_config { } #endif } - char* log_trace_str = getenv("PORTFFT_LOG_TRACE"); + char* log_trace_str = getenv("PORTFFT_LOG_TRACES"); if (log_trace_str != nullptr) { log_trace = static_cast(atoi(log_trace_str)); -#ifndef PORTFFT_LOG_TRACE +#ifndef PORTFFT_LOG_TRACES if (log_trace) { std::cerr << "Can not enable logging of traces if it is disabled at compile time." << std::endl; } @@ -281,7 +281,7 @@ struct global_data_struct { */ template PORTFFT_INLINE void log_message_global([[maybe_unused]] Ts... messages) { -#ifdef PORTFFT_LOG_TRACE +#ifdef PORTFFT_LOG_TRACES if (global_logging_config.log_trace && it.get_global_id(0) == 0) { log_message_impl(messages...); } diff --git a/src/portfft/descriptor_validate.hpp b/src/portfft/descriptor_validate.hpp new file mode 100644 index 00000000..3979885d --- /dev/null +++ b/src/portfft/descriptor_validate.hpp @@ -0,0 +1,271 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Codeplay's portFFT + * + **************************************************************************/ + +#ifndef PORTFFT_DESCRIPTOR_VALIDATE_HPP +#define PORTFFT_DESCRIPTOR_VALIDATE_HPP + +#include + +#include "common/exceptions.hpp" +#include "enums.hpp" +#include "utils.hpp" + +namespace portfft::detail::validate { + +namespace detail { + +/** + * Throw an exception if the lengths are invalid when looked at in isolation. + * + * @param lengths the dimensions of the tranform + */ +inline void validate_lengths(const std::vector& lengths) { + if (lengths.empty()) { + throw invalid_configuration("Invalid lengths, must have at least 1 dimension"); + } + for (std::size_t i = 0; i < lengths.size(); ++i) { + if (lengths[i] == 0) { + throw invalid_configuration("Invalid lengths[", i, "]=", lengths[i], ", must be positive"); + } + } +} + +/** + * Throw an exception if the layout is unsupported. + * + * @param lengths the dimensions of the tranform + * @param forward_layout the layout of the forward domain + * @param backward_layout the layout of the backward domain + */ +inline void validate_layout(const std::vector& lengths, portfft::detail::layout forward_layout, + portfft::detail::layout backward_layout) { + if (lengths.size() > 1) { + const bool supported_layout = + forward_layout == portfft::detail::layout::PACKED && backward_layout == portfft::detail::layout::PACKED; + if (!supported_layout) { + throw unsupported_configuration("Multi-dimensional transforms are only supported with default data layout"); + } + } +} + +/** + * Throw an exception if individual stride, distance and number_of_transforms values are invalid/inconsistent. + * + * @param lengths the dimensions of the tranform + * @param number_of_tranforms the number of batches + * @param strides the strides between elements in a domain + * @param distance the distance between batches in a domain + * @param domain_str a string with the name of the domain being validated + */ +inline void validate_strides_distance_basic(const std::vector& lengths, std::size_t number_of_transforms, + const std::vector& strides, std::size_t distance, + const std::string_view domain_str) { + // Validate stride + std::size_t expected_num_strides = lengths.size(); + if (strides.size() != expected_num_strides) { + throw invalid_configuration("Mismatching ", domain_str, " strides length got ", strides.size(), " expected ", + expected_num_strides); + } + for (std::size_t i = 0; i < strides.size(); ++i) { + if (strides[i] == 0) { + throw invalid_configuration("Invalid ", domain_str, " stride[", i, "]=", strides[i], ", must be positive"); + } + } + + // Validate distance + if (number_of_transforms > 1 && distance == 0) { + throw invalid_configuration("Invalid ", domain_str, " distance ", distance, ", must be positive for batched FFTs"); + } +} + +/** + * For multidimensional transforms, check that the strides are large enough so there will not be overlap within a single + * batch. Throw when the strides are not big enough. This accounts for layouts like batch interleaved. + * + * @param lengths the dimensions of the tranform + * @param number_of_tranforms the number of batches + * @param strides the strides between elements in a domain + * @param distance the distance between batches in a domain + * @param domain_str a string with the name of the domain being validated + */ +inline void strides_distance_multidim_check(const std::vector& lengths, std::size_t number_of_transforms, + const std::vector& strides, std::size_t distance, + const std::string_view domain_str) { + // Quick check for most common configurations. + // This check has some false-negative for some impractical configurations. + // View the output data as a N+1 dimensional tensor for a N-dimension FFT: the number of batch is just another + // dimension with a stride of 'distance'. This sorts the dimensions from fastest moving (inner-most) to slowest + // moving (outer-most) and check that the stride of a dimension is large enough to avoid overlapping the previous + // dimension. + std::vector generic_strides = strides; + std::vector generic_sizes = lengths; + if (number_of_transforms > 1) { + generic_strides.push_back(distance); + generic_sizes.push_back(number_of_transforms); + } + std::vector indices(generic_sizes.size()); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), + [&](std::size_t a, std::size_t b) { return generic_strides[a] < generic_strides[b]; }); + + for (std::size_t i = 1; i < indices.size(); ++i) { + bool fits_in_next_dim = + generic_strides[indices[i - 1]] * generic_sizes[indices[i - 1]] <= generic_strides[indices[i]]; + if (!fits_in_next_dim) { + throw invalid_configuration("Domain ", domain_str, + ": multi-dimension strides are not large enough to avoid overlap"); + } + } +} + +/** + * Check that batches of 1D FFTs don't overlap. + * + * @param lengths the dimensions of the tranform + * @param number_of_tranforms the number of batches + * @param strides the strides between elements in a domain + * @param distance the distance between batches in a domain + * @param domain_str a string with the name of the domain being validated + */ +inline void strides_distance_1d_check(const std::vector& lengths, std::size_t number_of_transforms, + const std::vector& strides, std::size_t distance, + const std::string_view domain_str) { + // It helps to think of the 1D transform layed out in 2D with row length of stride, that way each element of a + // transform will be contiguous down a column. + + // * If there is an index collision between batch N and batch N+M, then there will also be a collision between batch + // N-1 and batch N+M-1, so if there is any index collision, there will also be one with batch 0 (batch N-N and batch + // N+M-N). + // * If an index in a transform mod the stride of the transform is zero, then it would collide with the first batch, + // given an infinite FFT length. For all elements in a transforms, the index mod stride is the same. + // * If an element in a batch index collides with another batch, then all previous elements in that batch will also + // index collide with that batch, so we only need to check the first index of each batch. + + const std::size_t fft_size = lengths[0]; + const std::size_t stride = strides[0]; + + const std::size_t first_batch_limit = stride * fft_size; + if (first_batch_limit <= distance) { + return; + } + + for (std::size_t b = 1; b < number_of_transforms;) { + std::size_t batch_first_idx = b * distance; + auto column = batch_first_idx % stride; + if (column == 0) { // there may be a collision with the first batch + if (batch_first_idx >= first_batch_limit) { + // any further batch will only be further way + return; + } + throw invalid_configuration("Domain ", domain_str, ": batch ", b, " collides with first batch at index ", + batch_first_idx); + } + + // there won't be another collision until the column number is near to stride again, so skip a few + auto batches_until_new_column = (stride - column) / distance; + if ((stride - column) % distance != 0) { + batches_until_new_column += 1; + } + b += batches_until_new_column; + } +} + +/** + * Throw an exception if the given strides and distance are invalid for a single domain. + * + * @param lengths the dimensions of the tranform + * @param number_of_tranforms the number of batches + * @param strides the strides between elements in a domain + * @param distance the distance between batches in a domain + * @param domain_str a string with the name of the domain being validated + */ +inline void strides_distance_check(const std::vector& lengths, std::size_t number_of_transforms, + const std::vector& strides, std::size_t distance, + const std::string_view domain_str) { + validate_strides_distance_basic(lengths, number_of_transforms, strides, distance, domain_str); + if (lengths.size() > 1) { + strides_distance_multidim_check(lengths, number_of_transforms, strides, distance, domain_str); + } else { + strides_distance_1d_check(lengths, number_of_transforms, strides, distance, domain_str); + } +} + +/** + * Throw an exception if the given strides and distances are invalid for either domain. + * + * @param place where the result is written with respect to where it is read (in-place vs not in-place) + * @param lengths the dimensions of the tranform + * @param number_of_tranforms the number of batches + * @param forward_strides the strides between elements in the forward domain + * @param backward_strides the strides between elements in the backward domain + * @param forward_distance the distance between batches in the forward domain + * @param backward_distance the distance between batches in the backward domain + */ +inline void validate_strides_distance(placement place, const std::vector& lengths, + std::size_t number_of_transforms, const std::vector& forward_strides, + const std::vector& backward_strides, std::size_t forward_distance, + std::size_t backward_distance) { + if (place == placement::IN_PLACE) { + if (forward_strides != backward_strides) { + throw invalid_configuration("Invalid forward and backward strides must match for in-place configurations"); + } + if (forward_distance != backward_distance) { + throw invalid_configuration("Invalid forward and backward distances must match for in-place configurations"); + } + strides_distance_check(lengths, number_of_transforms, forward_strides, forward_distance, "forward"); + } else { + strides_distance_check(lengths, number_of_transforms, forward_strides, forward_distance, "forward"); + strides_distance_check(lengths, number_of_transforms, backward_strides, backward_distance, "backward"); + } +} +} // namespace detail + +/** + * @brief Check as much as possible if a given descriptor is valid and supported for the current capabilties of portFFT. + * @details The descriptor can still later be deemed unsupported if it is not immediately obvious. If the descriptor is + * invalid, it should be reported here or not at all. + * + * @param params the final description of the problem. + * @throws portfft::unsupported_configuration when the configuration is unsupported + * @throws portfft::invalid_configuration when the configuration is invalid e.g. would cause elements to overlap + */ +template +void validate_descriptor(const Descriptor& params) { + using namespace portfft; + + if constexpr (Descriptor::Domain == domain::REAL) { + throw unsupported_configuration("REAL domain is unsupported"); + } + + if (params.number_of_transforms == 0) { + throw invalid_configuration("Invalid number of transform ", params.number_of_transforms, ", must be positive"); + } + + detail::validate_lengths(params.lengths); + detail::validate_strides_distance(params.placement, params.lengths, params.number_of_transforms, + params.forward_strides, params.backward_strides, params.forward_distance, + params.backward_distance); + detail::validate_layout(params.lengths, portfft::detail::get_layout(params, direction::FORWARD), + portfft::detail::get_layout(params, direction::BACKWARD)); +} + +} // namespace portfft::detail::validate + +#endif diff --git a/src/portfft/dispatcher/global_dispatcher.hpp b/src/portfft/dispatcher/global_dispatcher.hpp index 3c96a36f..5dd6f56a 100644 --- a/src/portfft/dispatcher/global_dispatcher.hpp +++ b/src/portfft/dispatcher/global_dispatcher.hpp @@ -256,10 +256,9 @@ template template struct committed_descriptor_impl::set_spec_constants_struct::inner { static void execute(committed_descriptor_impl& /*desc*/, sycl::kernel_bundle& in_bundle, - std::size_t length, const std::vector& factors, detail::level level, Idx factor_num, + Idx length, const std::vector& factors, detail::level level, Idx factor_num, Idx num_factors) { PORTFFT_LOG_FUNCTION_ENTRY(); - Idx length_idx = static_cast(length); PORTFFT_LOG_TRACE("GlobalSubImplSpecConst:", level); in_bundle.template set_specialization_constant(level); PORTFFT_LOG_TRACE("GlobalSpecConstNumFactors:", num_factors); @@ -267,8 +266,8 @@ struct committed_descriptor_impl::set_spec_constants_struct::inn PORTFFT_LOG_TRACE("GlobalSpecConstLevelNum:", factor_num); in_bundle.template set_specialization_constant(factor_num); if (level == detail::level::WORKITEM || level == detail::level::WORKGROUP) { - PORTFFT_LOG_TRACE("SpecConstFftSize:", length_idx); - in_bundle.template set_specialization_constant(length_idx); + PORTFFT_LOG_TRACE("SpecConstFftSize:", length); + in_bundle.template set_specialization_constant(length); } else if (level == detail::level::SUBGROUP) { PORTFFT_LOG_TRACE("SubgroupFactorWISpecConst:", factors[1]); in_bundle.template set_specialization_constant(factors[1]); diff --git a/src/portfft/dispatcher/subgroup_dispatcher.hpp b/src/portfft/dispatcher/subgroup_dispatcher.hpp index 9a9e0c8d..700408cf 100644 --- a/src/portfft/dispatcher/subgroup_dispatcher.hpp +++ b/src/portfft/dispatcher/subgroup_dispatcher.hpp @@ -676,8 +676,8 @@ template template struct committed_descriptor_impl::set_spec_constants_struct::inner { static void execute(committed_descriptor_impl& /*desc*/, sycl::kernel_bundle& in_bundle, - std::size_t /*length*/, const std::vector& factors, detail::level /*level*/, - Idx /*factor_num*/, Idx /*num_factors*/) { + Idx /*length*/, const std::vector& factors, detail::level /*level*/, Idx /*factor_num*/, + Idx /*num_factors*/) { PORTFFT_LOG_FUNCTION_ENTRY(); PORTFFT_LOG_TRACE("SubgroupFactorWISpecConst:", factors[0]); in_bundle.template set_specialization_constant(factors[0]); diff --git a/src/portfft/dispatcher/workgroup_dispatcher.hpp b/src/portfft/dispatcher/workgroup_dispatcher.hpp index 4ed83076..bca3ca6b 100644 --- a/src/portfft/dispatcher/workgroup_dispatcher.hpp +++ b/src/portfft/dispatcher/workgroup_dispatcher.hpp @@ -344,12 +344,11 @@ template template struct committed_descriptor_impl::set_spec_constants_struct::inner { static void execute(committed_descriptor_impl& /*desc*/, sycl::kernel_bundle& in_bundle, - std::size_t length, const std::vector& /*factors*/, detail::level /*level*/, - Idx /*factor_num*/, Idx /*num_factors*/) { + Idx length, const std::vector& /*factors*/, detail::level /*level*/, Idx /*factor_num*/, + Idx /*num_factors*/) { PORTFFT_LOG_FUNCTION_ENTRY(); - const Idx length_idx = static_cast(length); - PORTFFT_LOG_TRACE("SpecConstFftSize:", length_idx); - in_bundle.template set_specialization_constant(length_idx); + PORTFFT_LOG_TRACE("SpecConstFftSize:", length); + in_bundle.template set_specialization_constant(length); } }; diff --git a/src/portfft/dispatcher/workitem_dispatcher.hpp b/src/portfft/dispatcher/workitem_dispatcher.hpp index 9a351749..fa6f6261 100644 --- a/src/portfft/dispatcher/workitem_dispatcher.hpp +++ b/src/portfft/dispatcher/workitem_dispatcher.hpp @@ -113,12 +113,18 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag T scaling_factor = kh.get_specialization_constant()>(); const Idx fft_size = kh.get_specialization_constant(); + const IdxGlobal input_stride = kh.get_specialization_constant(); + const IdxGlobal output_stride = kh.get_specialization_constant(); + const IdxGlobal input_distance = kh.get_specialization_constant(); + const IdxGlobal output_distance = kh.get_specialization_constant(); global_data.log_message_global(__func__, "entered", "fft_size", fft_size, "n_transforms", n_transforms); bool interleaved_storage = storage == complex_storage::INTERLEAVED_COMPLEX; const Idx n_reals = 2 * fft_size; const Idx n_io_reals = interleaved_storage ? n_reals : fft_size; + const IdxGlobal input_distance_in_reals = interleaved_storage ? 2 * input_distance : input_distance; + const IdxGlobal output_distance_in_reals = interleaved_storage ? 2 * output_distance : output_distance; #ifdef PORTFFT_USE_SCLA T wi_private_scratch[detail::SpecConstWIScratchSize]; @@ -130,8 +136,6 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag T priv[2 * MaxComplexPerWI]; #endif Idx subgroup_local_id = static_cast(global_data.sg.get_local_linear_id()); - IdxGlobal global_id = static_cast(global_data.it.get_global_id(0)); - IdxGlobal global_size = static_cast(global_data.it.get_global_range(0)); Idx subgroup_id = static_cast(global_data.sg.get_group_id()); Idx local_offset = n_reals * SubgroupSize * subgroup_id; Idx local_imag_offset = fft_size * SubgroupSize; @@ -140,24 +144,67 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag auto loc_load_modifier_view = detail::padded_view(loc_load_modifier, BankLinesPerPad); auto loc_store_modifier_view = detail::padded_view(loc_store_modifier, BankLinesPerPad); - for (IdxGlobal i = global_id; i < round_up_to_multiple(n_transforms, static_cast(SubgroupSize)); - i += global_size) { - bool working = i < n_transforms; - Idx n_working = sycl::min(SubgroupSize, static_cast(n_transforms - i) + subgroup_local_id); - - IdxGlobal global_offset = static_cast(n_io_reals) * (i - static_cast(subgroup_local_id)); - if (LayoutIn == detail::layout::PACKED) { - if (storage == complex_storage::INTERLEAVED_COMPLEX) { - global_data.log_message_global(__func__, "loading non-transposed data from global to local memory"); - global2local(global_data, input, loc_view, n_reals * n_working, global_offset, - local_offset); + const IdxGlobal transform_idx_begin = static_cast(global_data.it.get_global_id(0)); + const IdxGlobal transform_idx_step = static_cast(global_data.it.get_global_range(0)); + const IdxGlobal transform_idx_end = round_up_to_multiple(n_transforms, static_cast(SubgroupSize)); + for (IdxGlobal i = transform_idx_begin; i < transform_idx_end; i += transform_idx_step) { + const bool working = i < n_transforms; + IdxGlobal leader_i = i - static_cast(subgroup_local_id); + + Idx n_working = sycl::min(SubgroupSize, static_cast(n_transforms - leader_i)); + IdxGlobal global_offset = static_cast(n_io_reals) * leader_i; + IdxGlobal global_input_offset = static_cast(input_distance_in_reals) * leader_i; + IdxGlobal global_output_offset = static_cast(output_distance_in_reals) * leader_i; + + // This is checking for LayoutIn is PACKED or UNPACKED but we don't actually ever launch kernels with LayoutIn + // as UNPACKED. + if (LayoutIn == detail::layout::PACKED || LayoutIn == detail::layout::UNPACKED) { + // copy into local memory cooperatively as a subgroup, allowing coalesced memory access for when elements of a + // single FFT are sequential. BATCH_INTERLEAVED skips this step and loads straight from global to registers since + // the sequential work-items already access sequential elements. + if (input_stride == 1 && input_distance == fft_size) { + if (storage == complex_storage::INTERLEAVED_COMPLEX) { + global_data.log_message_global(__func__, "loading packed data from global to local memory"); + global2local(global_data, input, loc_view, n_reals * n_working, global_offset, + local_offset); + } else { + global_data.log_message_global(__func__, "loading packed real data from global to local memory"); + global2local(global_data, input, loc_view, fft_size * n_working, global_offset, + local_offset); + global_data.log_message_global(__func__, "loading packed imaginary data from global to local memory"); + global2local(global_data, input_imag, loc_view, fft_size * n_working, + global_offset, local_offset + local_imag_offset); + } } else { - global_data.log_message_global(__func__, "loading non-transposed real data from global to local memory"); - global2local(global_data, input, loc_view, fft_size * n_working, global_offset, - local_offset); - global_data.log_message_global(__func__, "loading non-transposed imaginary data from global to local memory"); - global2local(global_data, input_imag, loc_view, fft_size * n_working, - global_offset, local_offset + local_imag_offset); + if (storage == complex_storage::INTERLEAVED_COMPLEX) { + // NOTE this is potentially less optimal when input_distance < input_stride. In that case, the data is not + // read is order of memory address and will potentially not be as coalesed as possible. For input_distance < + // input_stride, we could also skip loading to local memory, since each work-item will be only loading it's + // own data anyway. + std::array global_strides{input_distance * 2, input_stride * 2, 1}; + std::array local_strides{fft_size * 2, 2, 1}; + std::array copy_indices{n_working, fft_size, 2}; + + detail::md_view global_input_view{input, global_strides, global_input_offset}; + detail::md_view local_input_view{loc_view, local_strides, local_offset}; + + global_data.log_message_global(__func__, "loading unpacked data from global to local memory"); + copy_group(global_data, global_input_view, local_input_view, copy_indices); + } else { + std::array global_strides{input_distance, input_stride}; + std::array local_strides{fft_size, 1}; + std::array copy_indices{n_working, fft_size}; + + detail::md_view global_input_real_view{input, global_strides, global_input_offset}; + detail::md_view local_input_real_view{loc_view, local_strides, local_offset}; + detail::md_view global_input_imag_view{input_imag, global_strides, global_input_offset}; + detail::md_view local_input_imag_view{loc_view, local_strides, local_offset + local_imag_offset}; + + global_data.log_message_global(__func__, "loading unpacked real data from global to local memory"); + copy_group(global_data, global_input_real_view, local_input_real_view, copy_indices); + global_data.log_message_global(__func__, "loading unpacked imaginary data from global to local memory"); + copy_group(global_data, global_input_imag_view, local_input_imag_view, copy_indices); + } } #ifdef PORTFFT_LOG_DUMPS sycl::group_barrier(global_data.sg); @@ -199,6 +246,7 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag } } global_data.log_dump_private("data loaded in registers:", priv, n_reals); + if (multiply_on_load == detail::elementwise_multiply::APPLIED) { // Assumes load modifier data is stored in a transposed fashion (fft_size x num_batches_local_mem) // to ensure much lesser bank conflicts @@ -213,6 +261,7 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag conjugate_inplace(priv, fft_size); } global_data.log_dump_private("data in registers after computation:", priv, n_reals); + if (multiply_on_store == detail::elementwise_multiply::APPLIED) { // Assumes store modifier data is stored in a transposed fashion (fft_size x num_batches_local_mem) // to ensure much lesser bank conflicts @@ -227,8 +276,9 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag } } global_data.log_dump_private("data in registers after scaling:", priv, n_reals); - global_data.log_message_global(__func__, "loading data from private to local memory"); - if (LayoutOut == detail::layout::PACKED) { + + if (LayoutOut == detail::layout::PACKED || LayoutOut == detail::layout::UNPACKED) { + global_data.log_message_global(__func__, "loading data from private to local memory"); if (storage == complex_storage::INTERLEAVED_COMPLEX) { detail::offset_view offset_local_view{loc_view, local_offset + subgroup_local_id * n_reals}; copy_wi(global_data, priv, offset_local_view, n_reals); @@ -255,18 +305,48 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag } } } - if (LayoutOut == detail::layout::PACKED) { + // This is checking for LayoutOut is PACKED or UNPACKED but we don't actually ever launch kernels with LayoutOut + // as UNPACKED, but just use PACKED and check the stride. + if (LayoutOut == detail::layout::PACKED || LayoutOut == detail::layout::UNPACKED) { sycl::group_barrier(global_data.sg); global_data.log_dump_local("computed data local memory:", loc, n_reals * n_working); - global_data.log_message_global(__func__, "storing data from local to global memory"); - if (storage == complex_storage::INTERLEAVED_COMPLEX) { - local2global(global_data, loc_view, output, n_reals * n_working, local_offset, - global_offset); + if (output_stride == 1 && output_distance == fft_size) { + if (storage == complex_storage::INTERLEAVED_COMPLEX) { + global_data.log_message_global(__func__, "storing data from local to packed global memory"); + local2global(global_data, loc_view, output, n_reals * n_working, local_offset, + global_offset); + } else { + global_data.log_message_global(__func__, "storing real data from local to packed global memory"); + local2global(global_data, loc_view, output, fft_size * n_working, local_offset, + global_offset); + global_data.log_message_global(__func__, "storing imaginary data from local to packed global memory"); + local2global(global_data, loc_view, output_imag, fft_size * n_working, + local_offset + local_imag_offset, global_output_offset); + } } else { - local2global(global_data, loc_view, output, fft_size * n_working, local_offset, - global_offset); - local2global(global_data, loc_view, output_imag, fft_size * n_working, - local_offset + local_imag_offset, global_offset); + if (storage == complex_storage::INTERLEAVED_COMPLEX) { + std::array global_strides{output_distance * 2, output_stride * 2, 1}; + std::array local_strides{fft_size * 2, 2, 1}; + std::array copy_indices{n_working, fft_size, 2}; + + detail::md_view global_output_view{output, global_strides, global_output_offset}; + detail::md_view local_output_view{loc_view, local_strides, local_offset}; + global_data.log_message_global(__func__, "storing data from local to unpacked global memory"); + copy_group(global_data, local_output_view, global_output_view, copy_indices); + } else { + std::array global_strides{output_distance, output_stride}; + std::array local_strides{fft_size, 1}; + std::array copy_indices{n_working, fft_size}; + + detail::md_view global_output_real_view{output, global_strides, global_output_offset}; + detail::md_view local_output_real_view{loc_view, local_strides, local_offset}; + detail::md_view global_output_imag_view{output_imag, global_strides, global_output_offset}; + detail::md_view local_output_imag_view{loc_view, local_strides, local_offset + local_imag_offset}; + global_data.log_message_global(__func__, "storing real data from local to unpacked global memory"); + copy_group(global_data, local_output_real_view, global_output_real_view, copy_indices); + global_data.log_message_global(__func__, "storing imaginary data from local to unpacked global memory"); + copy_group(global_data, local_output_imag_view, global_output_imag_view, copy_indices); + } } sycl::group_barrier(global_data.sg); } @@ -292,6 +372,7 @@ struct committed_descriptor_impl::run_kernel_struct(detail::get_global_size_workitem( n_transforms, SubgroupSize, kernel_data.num_sgs_per_wg, desc.n_compute_units)); + return desc.queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); cgh.use_kernel_bundle(kernel_data.exec_bundle); @@ -333,12 +414,11 @@ template template struct committed_descriptor_impl::set_spec_constants_struct::inner { static void execute(committed_descriptor_impl& /*desc*/, sycl::kernel_bundle& in_bundle, - std::size_t length, const std::vector& /*factors*/, detail::level /*level*/, - Idx /*factor_num*/, Idx /*num_factors*/) { + Idx length, const std::vector& /*factors*/, detail::level /*level*/, Idx /*factor_num*/, + Idx /*num_factors*/) { PORTFFT_LOG_FUNCTION_ENTRY(); - const Idx length_idx = static_cast(length); - PORTFFT_LOG_TRACE("SpecConstFftSize:", length_idx); - in_bundle.template set_specialization_constant(length_idx); + PORTFFT_LOG_TRACE("SpecConstFftSize:", length); + in_bundle.template set_specialization_constant(length); } }; diff --git a/src/portfft/enums.hpp b/src/portfft/enums.hpp index bbffac64..0240ec63 100644 --- a/src/portfft/enums.hpp +++ b/src/portfft/enums.hpp @@ -43,6 +43,20 @@ enum class pad { DONT_PAD, DO_PAD }; enum class level { WORKITEM, SUBGROUP, WORKGROUP, GLOBAL }; +constexpr const char* level_to_string(level l) { + switch (l) { + case level::WORKITEM: + return "WORKITEM"; + case level::SUBGROUP: + return "SUBGROUP"; + case level::WORKGROUP: + return "WORKGROUP"; + case level::GLOBAL: + return "GLOBAL"; + } + return "UNKNOWN"; +} + enum class layout { /// Packed layout represents default strides and distance. /// Each FFT is contiguous and each FFT is stored one after the other. @@ -56,6 +70,18 @@ enum class layout { BATCH_INTERLEAVED }; +constexpr const char* layout_to_string(layout l) { + switch (l) { + case layout::PACKED: + return "PACKED"; + case layout::UNPACKED: + return "UNPACKED"; + case layout::BATCH_INTERLEAVED: + return "BATCH_INTERLEAVED"; + } + return "UNKNOWN"; +} + enum class memory { BUFFER, USM }; enum class transfer_direction { diff --git a/src/portfft/specialization_constant.hpp b/src/portfft/specialization_constant.hpp index 68bca570..713ff358 100644 --- a/src/portfft/specialization_constant.hpp +++ b/src/portfft/specialization_constant.hpp @@ -26,12 +26,17 @@ #include "defines.hpp" #include "enums.hpp" -namespace portfft { -namespace detail { +namespace portfft::detail { constexpr static sycl::specialization_id SpecConstFftSize{}; constexpr static sycl::specialization_id SpecConstNumRealsPerFFT{}; constexpr static sycl::specialization_id SpecConstWIScratchSize{}; + +constexpr static sycl::specialization_id SpecConstInputStride{}; +constexpr static sycl::specialization_id SpecConstOutputStride{}; +constexpr static sycl::specialization_id SpecConstInputDistance{}; +constexpr static sycl::specialization_id SpecConstOutputDistance{}; + constexpr static sycl::specialization_id SpecConstComplexStorage{}; constexpr static sycl::specialization_id SpecConstMultiplyOnLoad{}; constexpr static sycl::specialization_id SpecConstMultiplyOnStore{}; @@ -51,6 +56,5 @@ constexpr static sycl::specialization_id SpecConstCon constexpr static sycl::specialization_id SpecConstScaleFactorFloat{}; constexpr static sycl::specialization_id SpecConstScaleFactorDouble{}; -} // namespace detail -} // namespace portfft +} // namespace portfft::detail #endif diff --git a/src/portfft/utils.hpp b/src/portfft/utils.hpp index 3a79fbf3..bfe518ab 100644 --- a/src/portfft/utils.hpp +++ b/src/portfft/utils.hpp @@ -29,6 +29,7 @@ #include "common/logging.hpp" #include "defines.hpp" #include "enums.hpp" +#include "specialization_constant.hpp" namespace portfft { namespace detail { @@ -193,6 +194,69 @@ PORTFFT_INLINE constexpr const sycl::specialization_id& get_spec_constan } } +/** + * Return the default strides for a given dft size + * + * @param lengths the dimensions of the dft + */ +inline std::vector get_default_strides(const std::vector& lengths) { + PORTFFT_LOG_FUNCTION_ENTRY(); + std::vector strides(lengths.size()); + std::size_t total_size = 1; + for (std::size_t i_plus1 = lengths.size(); i_plus1 > 0; i_plus1--) { + std::size_t i = i_plus1 - 1; + strides[i] = total_size; + total_size *= lengths[i]; + } + PORTFFT_LOG_TRACE("Default strides:", strides); + return strides; +} + +/** + * Return whether the given descriptor has default strides and distance for a given direction + * + * @tparam Descriptor Descriptor type + * @param desc Descriptor to check + * @param dir Direction + */ +template +bool has_default_strides_and_distance(const Descriptor& desc, direction dir) { + const auto default_strides = get_default_strides(desc.lengths); + const auto default_distance = desc.get_flattened_length(); + return desc.get_strides(dir) == default_strides && desc.get_distance(dir) == default_distance; +} + +/** + * Return whether the given descriptor has strides and distance consistent with the batch interleaved layout + * + * @tparam Descriptor Descriptor type + * @param desc Descriptor to check + * @param dir Direction + */ +template +bool is_batch_interleaved(const Descriptor& desc, direction dir) { + return desc.lengths.size() == 1 && desc.get_distance(dir) == 1 && + desc.get_strides(dir).back() == desc.number_of_transforms; +} + +/** + * Return an enum describing the layout of the data in the descriptor + * + * @tparam Descriptor Descriptor type + * @param desc Descriptor to check + * @param dir Direction + */ +template +detail::layout get_layout(const Descriptor& desc, direction dir) { + if (has_default_strides_and_distance(desc, dir)) { + return detail::layout::PACKED; + } + if (is_batch_interleaved(desc, dir)) { + return detail::layout::BATCH_INTERLEAVED; + } + return detail::layout::UNPACKED; +} + } // namespace detail } // namespace portfft #endif diff --git a/test/common/reference_data_wrangler.hpp b/test/common/reference_data_wrangler.hpp index ab1b0588..a35ec8eb 100644 --- a/test/common/reference_data_wrangler.hpp +++ b/test/common/reference_data_wrangler.hpp @@ -35,20 +35,55 @@ #include #include +// Used to create padding that is either a scalar or a complex value with equal real and imaginary parts. +template +T padding_representation(float p) { + if constexpr (std::is_floating_point_v) { + return p; + } else { + static_assert(std::is_same_v> || std::is_same_v>); + return {p, p}; + } +} + /** - * Runs Out of place transpose - * - * @tparam T Input Type - * @param in input pointer - * @param dft_len innermost dimension of the input - * @param batches innermost dimension of the output + * Reshare the packed reference data to the layout specified in \p desc. */ -template -std::vector transpose(const std::vector& in, std::size_t dft_len, std::size_t batches) { - std::vector out(in.size()); - for (std::size_t j = 0; j < dft_len; j++) { - for (std::size_t i = 0; i < batches; i++) { - out.at(i + j * batches) = in.at(j + i * dft_len); +template +std::vector reshape_to_desc(const std::vector& in, const Descriptor& desc, + portfft::detail::layout layout, portfft::direction dir, float padding_value) { + const auto flat_len = desc.get_flattened_length(); + + // assume we are starting with the packed format of the descriptor + assert(in.size() == flat_len * desc.number_of_transforms); + + // padding is added during initialization + std::vector out(desc.get_input_count(dir), padding_representation(padding_value)); + + const auto offset = static_cast(desc.get_offset(dir)); + + if (layout == portfft::detail::layout::PACKED) { + std::copy(in.cbegin(), in.cend(), out.begin() + offset); + } else { + // only handling 1D for now + assert(desc.lengths.size() == 1); + const auto stride = desc.get_strides(dir).back(); + const auto distance = desc.get_distance(dir); + + // add strides and distances + InType const* in_iter = in.data(); + InType* out_batch_iter = out.data() + offset; + for (std::size_t b = 0; b != desc.number_of_transforms; b += 1) { + InType* out_transform_iter = out_batch_iter; + + for (std::size_t e = 0; e != flat_len; e += 1) { + *out_transform_iter = *in_iter; + + in_iter += 1; + out_transform_iter += stride; + } + + out_batch_iter += distance; } } return out; @@ -59,7 +94,6 @@ std::vector transpose(const std::vector& in, std::size_t dft_len, std::siz * @tparam Storage complex storage to use * @tparam Scalar type of the scalar used for computations * @tparam Domain domain of the FFT - * @tparam PaddingT type of the padding value * @param desc The description of the FFT * @param padding_value The value to use in memory locations that are not expected to be read or written. * @param layout_in layout (PACKED/BATCH_INTERLEAVED) of the input data @@ -69,13 +103,13 @@ std::vector transpose(const std::vector& in, std::size_t dft_len, std::siz *`Storage` is split, first two values contain input and output real part and the last two input and output imaginary *part of the data. **/ -template +template auto gen_fourier_data(portfft::descriptor& desc, portfft::detail::layout layout_in, - portfft::detail::layout layout_out, PaddingT padding_value) { + portfft::detail::layout layout_out, float padding_value) { constexpr bool IsRealDomain = Domain == portfft::domain::REAL; constexpr bool IsForward = Dir == portfft::direction::FORWARD; constexpr bool IsInterleaved = Storage == portfft::complex_storage::INTERLEAVED_COMPLEX; + constexpr bool debug_input = false; const auto batches = desc.number_of_transforms; const auto& dims = desc.lengths; @@ -84,18 +118,27 @@ auto gen_fourier_data(portfft::descriptor& desc, portfft::detail "python3 -c \"" "import numpy as np\n" "from sys import stdout\n" - "def gen_data(batch, dims, is_complex, is_double):\n" + "def gen_data(batch, dims, is_complex, is_double, debug_input):\n" " scalar_type = np.double if is_double else np.single\n" " complex_type = np.complex128 if is_double else np.complex64\n" + " forward_type = complex_type if is_complex else scalar_type\n" " dataGenDims = [batch] + dims\n" - " rng = np.random.Generator(np.random.SFC64(0))\n" - " inData = rng.uniform(-1, 1, dataGenDims).astype(scalar_type)\n" + "\n" + " if (debug_input):\n" + " inData = np.arange(np.prod(dataGenDims)).reshape(dataGenDims).astype(forward_type) + 7j\n" + " else:\n" + " rng = np.random.Generator(np.random.SFC64(0))\n" + " inData = rng.uniform(-1, 1, dataGenDims).astype(scalar_type)\n" + " if (is_complex):\n" + " inData = inData + 1j * rng.uniform(-1, 1, dataGenDims).astype(scalar_type)\n" + "\n" " if (is_complex):\n" - " inData = inData + 1j * rng.uniform(-1, 1, dataGenDims).astype(scalar_type)\n" " outData = np.fft.fftn(inData, axes=range(1, len(dims) + 1))\n" " else:\n" " outData = np.fft.rfftn(inData, axes=range(1, len(dims) + 1))\n" + " # outData is always double precision at this point\n" " outData = outData.astype(complex_type)\n" + "\n" " # input and output shape is irrelevant when outputting the buffer\n" " stdout.buffer.write(inData.tobytes())\n" " stdout.buffer.write(outData.tobytes())\n" @@ -117,6 +160,8 @@ auto gen_fourier_data(portfft::descriptor& desc, portfft::detail command << "," << (std::is_same_v ? "False" : "True"); + command << "," << (debug_input ? "True" : "False"); + command << ")\""; FILE* f = popen(command.str().c_str(), "r"); @@ -152,22 +197,6 @@ auto gen_fourier_data(portfft::descriptor& desc, portfft::detail throw std::runtime_error("Reference data was not transferred correctly"); } - // modify layout - if (layout_in == portfft::detail::layout::BATCH_INTERLEAVED) { - if constexpr (IsForward) { - forward = transpose(forward, elements / batches, desc.number_of_transforms); - } else { - backward = transpose(backward, backward_elements / batches, desc.number_of_transforms); - } - } - if (layout_out == portfft::detail::layout::BATCH_INTERLEAVED) { - if constexpr (IsForward) { - backward = transpose(backward, backward_elements / batches, desc.number_of_transforms); - } else { - forward = transpose(forward, elements / batches, desc.number_of_transforms); - } - } - // Apply scaling factor to the output // Do this before adding offset to avoid scaling the offsets if (IsForward) { @@ -180,17 +209,12 @@ auto gen_fourier_data(portfft::descriptor& desc, portfft::detail std::for_each(forward.begin(), forward.end(), [scaling_factor](auto& x) { x *= scaling_factor; }); } - auto insert_offset = [=](auto inputVec, std::size_t offset) { - using InputT = decltype(inputVec); - std::ptrdiff_t fill_sz = static_cast(offset); - InputT outputVec(inputVec.size() + offset); - std::fill(outputVec.begin(), outputVec.begin() + fill_sz, static_cast(padding_value)); - std::copy(inputVec.cbegin(), inputVec.cend(), outputVec.begin() + fill_sz); - return outputVec; - }; + const auto layout_fwd = IsForward ? layout_in : layout_out; + const auto layout_bwd = IsForward ? layout_out : layout_in; - forward = insert_offset(forward, desc.forward_offset); - backward = insert_offset(backward, desc.backward_offset); + // modify layout + forward = reshape_to_desc(forward, desc, layout_fwd, portfft::direction::FORWARD, padding_value); + backward = reshape_to_desc(backward, desc, layout_bwd, portfft::direction::BACKWARD, padding_value); std::vector forward_real; std::vector backward_real; @@ -262,16 +286,17 @@ void verify_dft(const portfft::descriptor& desc, const std::vect "Expected real data type for real backward / split complex dft verification."); } - auto data_shape = desc.lengths; - - if constexpr (IsForward && Domain == portfft::domain::REAL) { - data_shape.back() = data_shape.back() / 2 + 1; + if (ref_output.size() != actual_output.size()) { + std::cerr << "expect the reference size (" << ref_output.size() << ") and the actual size (" << actual_output.size() + << ") to be the same." << std::endl; + throw std::runtime_error("Verification Failed"); } - // TODO: Update this to take into account stride and distance. - std::size_t dft_len = std::accumulate(data_shape.cbegin(), data_shape.cend(), std::size_t(1), std::multiplies<>()); + const auto dft_len = desc.get_flattened_length(); + const auto dft_offset = IsForward ? desc.backward_offset : desc.forward_offset; + const auto dft_stride = IsForward ? desc.backward_strides.back() : desc.forward_strides.back(); + const auto dft_distance = IsForward ? desc.backward_distance : desc.forward_distance; - auto dft_offset = IsForward ? desc.backward_offset : desc.forward_offset; for (std::size_t i = 0; i < dft_offset; ++i) { if (ref_output[i] != actual_output[i]) { if constexpr (!IsInterleaved) { @@ -299,19 +324,20 @@ void verify_dft(const portfft::descriptor& desc, const std::vect Scalar max_L2_rel_err = 0; for (std::size_t t = 0; t < desc.number_of_transforms; ++t) { - const ElemT* this_batch_ref = ref_output.data() + dft_len * t + dft_offset; - const ElemT* this_batch_computed = actual_output.data() + dft_len * t + dft_offset; - const ElemT* this_batch_ref_imag = ref_output_imag.data() + dft_len * t + dft_offset; - const ElemT* this_batch_computed_imag = actual_output_imag.data() + dft_len * t + dft_offset; + const ElemT* this_batch_ref = ref_output.data() + dft_distance * t + dft_offset; + const ElemT* this_batch_computed = actual_output.data() + dft_distance * t + dft_offset; + const ElemT* this_batch_ref_imag = ref_output_imag.data() + dft_distance * t + dft_offset; + const ElemT* this_batch_computed_imag = actual_output_imag.data() + dft_distance * t + dft_offset; Scalar L2_err = 0; Scalar L2_norm = 0; for (std::size_t e = 0; e != dft_len; ++e) { - BwdType computed_val = this_batch_computed[e]; - BwdType ref_val = this_batch_ref[e]; + const auto batch_offset = e * dft_stride; + BwdType computed_val = this_batch_computed[batch_offset]; + BwdType ref_val = this_batch_ref[batch_offset]; if constexpr (!IsInterleaved) { - computed_val += std::complex(0, this_batch_computed_imag[e]); - ref_val += std::complex(0, this_batch_ref_imag[e]); + computed_val += std::complex(0, this_batch_computed_imag[batch_offset]); + ref_val += std::complex(0, this_batch_ref_imag[batch_offset]); } Scalar err = std::abs(computed_val - ref_val); Scalar norm_val = std::abs(ref_val); @@ -326,27 +352,20 @@ void verify_dft(const portfft::descriptor& desc, const std::vect // set to warning to make it print by default PORTFFT_LOG_WARNING("Max (across batches) relative L2 error: ", max_L2_rel_err); - for (std::size_t t = 0; t < desc.number_of_transforms; ++t) { - const ElemT* this_batch_ref = ref_output.data() + dft_len * t + dft_offset; - const ElemT* this_batch_computed = actual_output.data() + dft_len * t + dft_offset; - const ElemT* this_batch_ref_imag = ref_output_imag.data() + dft_len * t + dft_offset; - const ElemT* this_batch_computed_imag = actual_output_imag.data() + dft_len * t + dft_offset; - - for (std::size_t e = 0; e != dft_len; ++e) { - BwdType computed_val = this_batch_computed[e]; - BwdType ref_val = this_batch_ref[e]; - if constexpr (!IsInterleaved) { - computed_val += std::complex(0, this_batch_computed_imag[e]); - ref_val += std::complex(0, this_batch_ref_imag[e]); - } - Scalar diff = std::abs(computed_val - ref_val); - if (diff > comparison_tolerance && diff / std::abs(this_batch_computed[e]) > comparison_tolerance) { - // std::endl is used intentionally to flush the error message before google test exits the test. - std::cerr << "transform " << t << ", element " << e << ", with global idx " << t * dft_len + e - << ", does not match\nref " << ref_val << " vs " << computed_val << "\ndiff " << diff - << ", tolerance " << comparison_tolerance << std::endl; - throw std::runtime_error("Verification Failed"); - } + for (std::size_t i = dft_offset; i < ref_output.size(); i += 1) { + BwdType ref_val = ref_output[i]; + BwdType computed_val = actual_output[i]; + if constexpr (!IsInterleaved) { + ref_val += std::complex(0, ref_output_imag[i]); + computed_val += std::complex(0, actual_output_imag[i]); + } + const auto abs_diff = std::abs(computed_val - ref_val); + const auto rel_diff = abs_diff / std::abs(computed_val); + if (abs_diff > comparison_tolerance && rel_diff > comparison_tolerance) { + // std::endl is used intentionally to flush the error message before google test exits the test. + std::cerr << "value at index " << i << " does not match\nref " << ref_val << " vs " << computed_val << "\ndiff " + << abs_diff << ", tolerance " << comparison_tolerance << std::endl; + throw std::runtime_error("Verification Failed"); } } } diff --git a/test/unit_test/fft_test_utils.hpp b/test/unit_test/fft_test_utils.hpp index 7d152d68..a8643b39 100644 --- a/test/unit_test/fft_test_utils.hpp +++ b/test/unit_test/fft_test_utils.hpp @@ -50,6 +50,30 @@ struct test_placement_layouts_params { detail::layout output_layout; }; +struct layout_params { + std::vector lengths; + std::vector forward_strides; + std::vector backward_strides; + std::size_t forward_distance; + std::size_t backward_distance; + layout_params(std::vector lengths, std::vector forward_strides, + std::vector backward_strides) + : lengths(lengths), forward_strides(forward_strides), backward_strides(backward_strides) { + forward_distance = 1; + backward_distance = 1; + for (std::size_t i = 0; i < lengths.size(); i += 1) { + forward_distance *= lengths[i] * forward_strides[i]; + backward_distance *= lengths[i] * backward_strides[i]; + } + } + layout_params(std::vector lengths, std::vector forward_strides, + std::vector backward_strides, std::size_t forward_distance, std::size_t backward_distance) + : lengths(lengths), + forward_strides(forward_strides), + backward_strides(backward_strides), + forward_distance(forward_distance), + backward_distance(backward_distance) {} +}; using basic_param_tuple = std::tuple /*lengths*/>; using offsets_param_tuple = @@ -58,6 +82,8 @@ using offsets_param_tuple = using scales_param_tuple = std::tuple /*lengths*/, double /*forward_scale*/, double /*backward_scale*/>; +using layout_param_tuple = std::tuple; // More tuples can be added here to easily instantiate tests that will require different parameters struct test_params { @@ -72,6 +98,7 @@ struct test_params { std::optional backward_scale; std::optional forward_offset; std::optional backward_offset; + std::optional strides; test_params() = default; @@ -95,6 +122,11 @@ struct test_params { forward_scale = std::get<5>(params); backward_scale = std::get<6>(params); } + explicit test_params(layout_param_tuple params) + : test_params(basic_param_tuple{std::get<0>(params), std::get<1>(params), std::get<2>(params), + std::get<3>(params), std::get<4>(params).lengths}) { + strides = std::get<4>(params); + } }; /// Structure used by GTest to generate the test name @@ -102,19 +134,6 @@ struct test_params_print { std::string operator()(const testing::TestParamInfo& info) const { auto params = info.param; std::stringstream ss; - auto print_layout = [&ss](detail::layout layout) { - if (layout == detail::layout::PACKED) { - ss << "PACKED"; - } else if (layout == detail::layout::BATCH_INTERLEAVED) { - ss << "BATCH_INTERLEAVED"; - } - }; - auto print_double = [&](double d) { - std::string fp_str = std::to_string(d); - std::replace(fp_str.begin(), fp_str.end(), '-', 'm'); - std::replace(fp_str.begin(), fp_str.end(), '.', '_'); - ss << fp_str; - }; ss << "Placement_"; if (params.placement == placement::IN_PLACE) { @@ -122,17 +141,38 @@ struct test_params_print { } else if (params.placement == placement::OUT_OF_PLACE) { ss << "OOP"; } - ss << "__LayoutIn_"; - print_layout(params.input_layout); - ss << "__LayoutOut_"; - print_layout(params.output_layout); + + ss << "__LayoutIn_" << layout_to_string(params.input_layout); + ss << "__LayoutOut_" << layout_to_string(params.output_layout); + + if (params.strides) { + ss << "__FwdStrides"; + for (std::size_t s : params.strides.value().forward_strides) { + ss << "_" << s; + } + ss << "__FwdDistance_" << params.strides.value().forward_distance; + ss << "__BwdStrides"; + for (std::size_t s : params.strides.value().backward_strides) { + ss << "_" << s; + } + ss << "__BwdDistance_" << params.strides.value().backward_distance; + } + ss << "__Direction_" << (params.dir == direction::FORWARD ? "Fwd" : "Bwd"); ss << "__Storage_" << (params.storage == complex_storage::INTERLEAVED_COMPLEX ? "Interleaved" : "Split"); ss << "__Batch_" << params.batch; + ss << "__Lengths"; for (std::size_t length : params.lengths) { ss << "_" << length; } + + auto print_double = [&](double d) { + std::string fp_str = std::to_string(d); + std::replace(fp_str.begin(), fp_str.end(), '-', 'm'); + std::replace(fp_str.begin(), fp_str.end(), '.', '_'); + ss << fp_str; + }; if (params.forward_scale) { ss << "__FwdScale_"; print_double(*params.forward_scale); @@ -141,12 +181,14 @@ struct test_params_print { ss << "__BwdScale_"; print_double(*params.backward_scale); } + if (params.forward_offset) { ss << "__FwdOffset_" << *params.forward_offset; } if (params.backward_offset) { ss << "__BwdOffset_" << *params.backward_offset; } + return ss.str(); } }; @@ -174,17 +216,10 @@ auto get_descriptor(const test_params& params) { desc.complex_storage = params.storage; auto apply_layout_for_dir = [&desc, ¶ms](detail::layout layout, direction dir) { - if (layout == detail::layout::PACKED) { - // Keep default strides and set default distance for the PACKED layout if needed - if (desc.number_of_transforms > 1) { - desc.get_distance(dir) = desc.get_flattened_length(); - } - } else if (layout == detail::layout::BATCH_INTERLEAVED) { + if (layout == detail::layout::BATCH_INTERLEAVED) { // Set default strides and distance for the batch interleaved layout desc.get_strides(dir) = {static_cast(params.batch)}; desc.get_distance(dir) = 1; - } else { - throw std::runtime_error("Unsupported layout"); } }; // First set input strides and distance if needed then output ones @@ -203,6 +238,13 @@ auto get_descriptor(const test_params& params) { if (params.backward_offset) { desc.backward_offset = *params.backward_offset; } + if (params.strides) { + auto& strides = params.strides.value(); + desc.forward_strides = strides.forward_strides; + desc.forward_distance = strides.forward_distance; + desc.backward_strides = strides.backward_strides; + desc.backward_distance = strides.backward_distance; + } return desc; } @@ -410,8 +452,11 @@ void run_test(const test_params& params) { float padding_value = -5.f; // Value for memory that isn't written to. auto [host_input, host_reference_output, host_input_imag, host_reference_output_imag] = gen_fourier_data(desc, params.input_layout, params.output_layout, padding_value); - decltype(host_reference_output) host_output(desc.get_output_count(params.dir), padding_value); - decltype(host_reference_output_imag) host_output_imag( + using reference_container = decltype(host_reference_output); + using reference_imag_container = decltype(host_reference_output_imag); + reference_container host_output(desc.get_output_count(params.dir), + padding_representation(padding_value)); + reference_imag_container host_output_imag( Storage == complex_storage::SPLIT_COMPLEX ? desc.get_output_count(params.dir) : 0, padding_value); double n_elems = static_cast( std::accumulate(params.lengths.begin(), params.lengths.end(), 1ull, std::multiplies())); diff --git a/test/unit_test/instantiate_fft_tests.hpp b/test/unit_test/instantiate_fft_tests.hpp index d96449e1..0a900bea 100644 --- a/test/unit_test/instantiate_fft_tests.hpp +++ b/test/unit_test/instantiate_fft_tests.hpp @@ -30,6 +30,7 @@ // Mandatory parameters: placement, layout, direction, batch, lengths // Optional parameters: [forward_scale, backward_scale] class FFTTest : public ::testing::TestWithParam {}; +class InvalidFFTTest : public ::testing::TestWithParam {}; using sizes_t = std::vector; @@ -64,9 +65,17 @@ auto ip_packed_layout = ::testing::Values( test_placement_layouts_params{placement::IN_PLACE, detail::layout::PACKED, detail::layout::PACKED}); auto ip_batch_interleaved_layout = ::testing::Values(test_placement_layouts_params{ placement::IN_PLACE, detail::layout::BATCH_INTERLEAVED, detail::layout::BATCH_INTERLEAVED}); +auto ip_unpacked_unpacked_layout = ::testing::Values( + test_placement_layouts_params{placement::IN_PLACE, detail::layout::UNPACKED, detail::layout::UNPACKED}); -auto oop_packed_layout = ::testing::Values( +auto oop_packed_packed_layout = ::testing::Values( test_placement_layouts_params{placement::OUT_OF_PLACE, detail::layout::PACKED, detail::layout::PACKED}); +auto oop_unpacked_unpacked_layout = ::testing::Values( + test_placement_layouts_params{placement::OUT_OF_PLACE, detail::layout::UNPACKED, detail::layout::UNPACKED}); + +auto all_unpacked_unpacked_layout = ::testing::Values( + test_placement_layouts_params{placement::IN_PLACE, detail::layout::UNPACKED, detail::layout::UNPACKED}, + test_placement_layouts_params{placement::OUT_OF_PLACE, detail::layout::UNPACKED, detail::layout::UNPACKED}); constexpr test_placement_layouts_params valid_global_layouts[] = { #ifdef PORTFFT_ENABLE_OOP_BUILDS @@ -211,18 +220,108 @@ INSTANTIATE_TEST_SUITE_P(OffsetsMDErrorRegressionTest, FFTTest, // Scaled FFTs test suite auto scales = ::testing::Values(-1.0, 2.0); INSTANTIATE_TEST_SUITE_P(FwdScaledFFTTest, FFTTest, - ::testing::ConvertGenerator( - ::testing::Combine(oop_packed_layout, fwd_only, interleaved_storage, ::testing::Values(3), - ::testing::Values(sizes_t{9}, sizes_t{16}, sizes_t{64}, sizes_t{512}, - sizes_t{4096}, sizes_t{16, 512}), - scales, ::testing::Values(1.0))), + ::testing::ConvertGenerator(::testing::Combine( + oop_packed_packed_layout, fwd_only, interleaved_storage, ::testing::Values(3), + ::testing::Values(sizes_t{9}, sizes_t{16}, sizes_t{64}, sizes_t{512}, sizes_t{4096}, + sizes_t{16, 512}), + scales, ::testing::Values(1.0))), test_params_print()); INSTANTIATE_TEST_SUITE_P(BwdScaledFFTTest, FFTTest, - ::testing::ConvertGenerator( - ::testing::Combine(oop_packed_layout, bwd_only, interleaved_storage, ::testing::Values(3), - ::testing::Values(sizes_t{9}, sizes_t{16}, sizes_t{64}, sizes_t{512}, - sizes_t{4096}, sizes_t{16, 512}), - ::testing::Values(1.0), scales)), + ::testing::ConvertGenerator(::testing::Combine( + oop_packed_packed_layout, bwd_only, interleaved_storage, ::testing::Values(3), + ::testing::Values(sizes_t{9}, sizes_t{16}, sizes_t{64}, sizes_t{512}, sizes_t{4096}, + sizes_t{16, 512}), + ::testing::Values(1.0), scales)), + test_params_print()); + +INSTANTIATE_TEST_SUITE_P(workItemStridedOOPInOrder, FFTTest, + ::testing::ConvertGenerator(::testing::Combine( + oop_unpacked_unpacked_layout, both_directions, complex_storages, + ::testing::Values(1, 3, 33000ul), + ::testing::Values(layout_params{{3}, {4}, {7}}, layout_params{{8}, {11}, {2}}, + layout_params{{9}, {3}, {4}, 30, 40}))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P(workItemStridedOOPLikeBatchInterleaved, FFTTest, + ::testing::ConvertGenerator( + ::testing::Combine(oop_unpacked_unpacked_layout, both_directions, complex_storages, + // keep batches below 33 + ::testing::Values(1, 10, 33), + ::testing::Values(layout_params{{8}, {33}, {99}, 1, 3}, + layout_params{{8}, {33}, {2}, 1, 16}, + layout_params{{8}, {2}, {66}, 16, 2}))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P(workItemStridedIP, FFTTest, + ::testing::ConvertGenerator(::testing::Combine( + ip_unpacked_unpacked_layout, both_directions, complex_storages, + ::testing::Values(1, 3, 33000ul), + ::testing::Values(layout_params{{3}, {4}, {4}}, layout_params{{9}, {3}, {3}, 25, 25}))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P( + workItemStridedLikeBatchInterleaved, FFTTest, + ::testing::ConvertGenerator(::testing::Combine( + ip_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(1, 3, 33), + ::testing::Values(layout_params{{3}, {66}, {66}, 2, 2}, layout_params{{6}, {40}, {40}, 1, 1}))), + test_params_print()); + +// clang-format off +// Arbitrary interleaved FFT test suites +// The strides and distances are set so that no elements overlap but there are no single continuous dimension in memory either. +// This configuration is impractical but technically valid. For instance for n_batches=4, fft_size=4, stride=4, distance=3: +// Index in memory: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 +// Batch and FFT index: b0i0 b1i0 b0i1 b2i0 b1i1 b0i2 b3i0 b2i1 b1i2 b0i3 b3i1 b2i2 b1i3 b3i2 b2i3 b3i3 +// clang-format on +INSTANTIATE_TEST_SUITE_P(workItemStridedArbitraryInterleaved, FFTTest, + ::testing::ConvertGenerator(::testing::Combine( + all_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(4), + ::testing::Values(layout_params{{4}, {4}, {4}, 3, 3}))), + test_params_print()); + +// Invalid configurations test suite +INSTANTIATE_TEST_SUITE_P(InvalidLength, InvalidFFTTest, + ::testing::ConvertGenerator( + ::testing::Combine(all_valid_placement_layouts, both_directions, complex_storages, + ::testing::Values(1), ::testing::Values(0))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P(InvalidBatch, InvalidFFTTest, + ::testing::ConvertGenerator( + ::testing::Combine(all_valid_placement_layouts, both_directions, complex_storages, + ::testing::Values(0), ::testing::Values(1))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P(InvalidDistance, InvalidFFTTest, + ::testing::ConvertGenerator(::testing::Combine( + oop_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(2), + ::testing::Values(layout_params{{5}, {5}, {1}, 0, 5}, + layout_params{{5}, {1}, {5}, 5, 0}))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P(InvalidNonPositiveStrides, InvalidFFTTest, + ::testing::ConvertGenerator(::testing::Combine( + oop_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(1), + ::testing::Values(layout_params{{5}, {0}, {1}}, layout_params{{5}, {1}, {0}}, + layout_params{{5, 12}, {12, 1}, {12, 0}}))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P(InvalidShortDistance, InvalidFFTTest, + ::testing::ConvertGenerator(::testing::Combine( + oop_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(2), + ::testing::Values(layout_params{{8}, {1}, {1}, 7, 8}, + layout_params{{8, 4}, {8, 2}, {4, 1}, 24, 24}))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P(InvalidIPNotMatchingStridesDistance, InvalidFFTTest, + ::testing::ConvertGenerator(::testing::Combine( + ip_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(2), + ::testing::Values(layout_params{{8}, {2}, {1}, 16, 8}, + layout_params{{8, 4}, {8, 2}, {8, 2}, 48, 50}))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P(InvalidOverlap, InvalidFFTTest, + ::testing::ConvertGenerator(::testing::Combine( + oop_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(3), + ::testing::Values(layout_params{{4}, {1}, {1}, 1, 4}, + layout_params{{4}, {1}, {2}, 4, 3}))), + test_params_print()); +INSTANTIATE_TEST_SUITE_P(InvalidOverlapLarge, InvalidFFTTest, + ::testing::ConvertGenerator( + ::testing::Combine(oop_unpacked_unpacked_layout, both_directions, complex_storages, + ::testing::Values(3333334), + ::testing::Values(layout_params{{8}, {3333333}, {3333333}, 1, 1}))), test_params_print()); #define INSTANTIATE_TESTS_FULL(TYPE, MEMORY) \ @@ -255,4 +354,12 @@ INSTANTIATE_TEST_SUITE_P(BwdScaledFFTTest, FFTTest, #define INSTANTIATE_TESTS(TYPE) INSTANTIATE_TESTS_FULL(TYPE, usm) #endif +// The result of this test should not be dependent on scalar type or memory type +TEST_P(InvalidFFTTest, Test) { + auto params = GetParam(); + sycl::queue queue; + auto desc = get_descriptor(params); + EXPECT_THROW(desc.commit(queue), portfft::invalid_configuration); +} + #endif From 52ce1d0ccdad3718284d7dc55c977f5fa5bdd7b3 Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Thu, 1 Feb 2024 15:48:21 +0000 Subject: [PATCH 02/13] move validation to before the committed_descriptor constructor --- src/portfft/committed_descriptor_impl.hpp | 2 -- src/portfft/descriptor.hpp | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/portfft/committed_descriptor_impl.hpp b/src/portfft/committed_descriptor_impl.hpp index 1f472df8..dce91cd0 100644 --- a/src/portfft/committed_descriptor_impl.hpp +++ b/src/portfft/committed_descriptor_impl.hpp @@ -32,7 +32,6 @@ #include "common/exceptions.hpp" #include "common/subgroup.hpp" #include "defines.hpp" -#include "descriptor_validate.hpp" #include "enums.hpp" #include "specialization_constant.hpp" #include "utils.hpp" @@ -764,7 +763,6 @@ class committed_descriptor_impl { PORTFFT_LOG_TRACE("local_memory_size:", local_memory_size); PORTFFT_LOG_TRACE("llc_size:", llc_size); - detail::validate::validate_descriptor(params); // compile the kernels and precalculate twiddles std::size_t n_kernels = params.lengths.size(); diff --git a/src/portfft/descriptor.hpp b/src/portfft/descriptor.hpp index 030a2a0d..71144778 100644 --- a/src/portfft/descriptor.hpp +++ b/src/portfft/descriptor.hpp @@ -27,10 +27,11 @@ #include #include +#include "committed_descriptor.hpp" #include "defines.hpp" +#include "descriptor_validate.hpp" #include "enums.hpp" -#include "committed_descriptor.hpp" namespace portfft { @@ -151,6 +152,7 @@ struct descriptor { */ committed_descriptor commit(sycl::queue& queue) { PORTFFT_LOG_FUNCTION_ENTRY(); + detail::validate::validate_descriptor(*this); return {*this, queue}; } From 30d8589c899e0a4d7230939643ec1785b7008341 Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Thu, 1 Feb 2024 15:55:25 +0000 Subject: [PATCH 03/13] format --- src/portfft/committed_descriptor_impl.hpp | 1 - src/portfft/descriptor.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/src/portfft/committed_descriptor_impl.hpp b/src/portfft/committed_descriptor_impl.hpp index dce91cd0..5e916f39 100644 --- a/src/portfft/committed_descriptor_impl.hpp +++ b/src/portfft/committed_descriptor_impl.hpp @@ -763,7 +763,6 @@ class committed_descriptor_impl { PORTFFT_LOG_TRACE("local_memory_size:", local_memory_size); PORTFFT_LOG_TRACE("llc_size:", llc_size); - // compile the kernels and precalculate twiddles std::size_t n_kernels = params.lengths.size(); for (std::size_t i = 0; i < n_kernels; i++) { diff --git a/src/portfft/descriptor.hpp b/src/portfft/descriptor.hpp index 71144778..c885ed7a 100644 --- a/src/portfft/descriptor.hpp +++ b/src/portfft/descriptor.hpp @@ -32,7 +32,6 @@ #include "descriptor_validate.hpp" #include "enums.hpp" - namespace portfft { /** From cdf802c3685a04dc5d9ed6a728ba873db0724280 Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Tue, 6 Feb 2024 12:11:45 +0000 Subject: [PATCH 04/13] test tidying --- src/portfft/enums.hpp | 14 ------------- test/common/reference_data_wrangler.hpp | 6 +++--- test/unit_test/fft_test_utils.hpp | 26 ++++++++++++------------ test/unit_test/instantiate_fft_tests.hpp | 17 ++++++++-------- 4 files changed, 24 insertions(+), 39 deletions(-) diff --git a/src/portfft/enums.hpp b/src/portfft/enums.hpp index 0240ec63..19dd2019 100644 --- a/src/portfft/enums.hpp +++ b/src/portfft/enums.hpp @@ -43,20 +43,6 @@ enum class pad { DONT_PAD, DO_PAD }; enum class level { WORKITEM, SUBGROUP, WORKGROUP, GLOBAL }; -constexpr const char* level_to_string(level l) { - switch (l) { - case level::WORKITEM: - return "WORKITEM"; - case level::SUBGROUP: - return "SUBGROUP"; - case level::WORKGROUP: - return "WORKGROUP"; - case level::GLOBAL: - return "GLOBAL"; - } - return "UNKNOWN"; -} - enum class layout { /// Packed layout represents default strides and distance. /// Each FFT is contiguous and each FFT is stored one after the other. diff --git a/test/common/reference_data_wrangler.hpp b/test/common/reference_data_wrangler.hpp index a35ec8eb..0e5e05ce 100644 --- a/test/common/reference_data_wrangler.hpp +++ b/test/common/reference_data_wrangler.hpp @@ -293,9 +293,9 @@ void verify_dft(const portfft::descriptor& desc, const std::vect } const auto dft_len = desc.get_flattened_length(); - const auto dft_offset = IsForward ? desc.backward_offset : desc.forward_offset; - const auto dft_stride = IsForward ? desc.backward_strides.back() : desc.forward_strides.back(); - const auto dft_distance = IsForward ? desc.backward_distance : desc.forward_distance; + const auto dft_offset = desc.get_offset(inv(Dir)); + const auto dft_stride = desc.get_strides(inv(Dir)).back(); + const auto dft_distance = desc.get_distance(inv(Dir)); for (std::size_t i = 0; i < dft_offset; ++i) { if (ref_output[i] != actual_output[i]) { diff --git a/test/unit_test/fft_test_utils.hpp b/test/unit_test/fft_test_utils.hpp index a8643b39..90941d68 100644 --- a/test/unit_test/fft_test_utils.hpp +++ b/test/unit_test/fft_test_utils.hpp @@ -98,7 +98,7 @@ struct test_params { std::optional backward_scale; std::optional forward_offset; std::optional backward_offset; - std::optional strides; + std::optional explicit_layout; test_params() = default; @@ -125,7 +125,7 @@ struct test_params { explicit test_params(layout_param_tuple params) : test_params(basic_param_tuple{std::get<0>(params), std::get<1>(params), std::get<2>(params), std::get<3>(params), std::get<4>(params).lengths}) { - strides = std::get<4>(params); + explicit_layout = std::get<4>(params); } }; @@ -145,17 +145,17 @@ struct test_params_print { ss << "__LayoutIn_" << layout_to_string(params.input_layout); ss << "__LayoutOut_" << layout_to_string(params.output_layout); - if (params.strides) { + if (params.explicit_layout) { ss << "__FwdStrides"; - for (std::size_t s : params.strides.value().forward_strides) { + for (std::size_t s : params.explicit_layout.value().forward_strides) { ss << "_" << s; } - ss << "__FwdDistance_" << params.strides.value().forward_distance; + ss << "__FwdDistance_" << params.explicit_layout.value().forward_distance; ss << "__BwdStrides"; - for (std::size_t s : params.strides.value().backward_strides) { + for (std::size_t s : params.explicit_layout.value().backward_strides) { ss << "_" << s; } - ss << "__BwdDistance_" << params.strides.value().backward_distance; + ss << "__BwdDistance_" << params.explicit_layout.value().backward_distance; } ss << "__Direction_" << (params.dir == direction::FORWARD ? "Fwd" : "Bwd"); @@ -238,12 +238,12 @@ auto get_descriptor(const test_params& params) { if (params.backward_offset) { desc.backward_offset = *params.backward_offset; } - if (params.strides) { - auto& strides = params.strides.value(); - desc.forward_strides = strides.forward_strides; - desc.forward_distance = strides.forward_distance; - desc.backward_strides = strides.backward_strides; - desc.backward_distance = strides.backward_distance; + if (params.explicit_layout) { + auto& explicit_layout = params.explicit_layout.value(); + desc.forward_strides = explicit_layout.forward_strides; + desc.forward_distance = explicit_layout.forward_distance; + desc.backward_strides = explicit_layout.backward_strides; + desc.backward_distance = explicit_layout.backward_distance; } return desc; } diff --git a/test/unit_test/instantiate_fft_tests.hpp b/test/unit_test/instantiate_fft_tests.hpp index 0a900bea..41601257 100644 --- a/test/unit_test/instantiate_fft_tests.hpp +++ b/test/unit_test/instantiate_fft_tests.hpp @@ -241,15 +241,14 @@ INSTANTIATE_TEST_SUITE_P(workItemStridedOOPInOrder, FFTTest, ::testing::Values(layout_params{{3}, {4}, {7}}, layout_params{{8}, {11}, {2}}, layout_params{{9}, {3}, {4}, 30, 40}))), test_params_print()); -INSTANTIATE_TEST_SUITE_P(workItemStridedOOPLikeBatchInterleaved, FFTTest, - ::testing::ConvertGenerator( - ::testing::Combine(oop_unpacked_unpacked_layout, both_directions, complex_storages, - // keep batches below 33 - ::testing::Values(1, 10, 33), - ::testing::Values(layout_params{{8}, {33}, {99}, 1, 3}, - layout_params{{8}, {33}, {2}, 1, 16}, - layout_params{{8}, {2}, {66}, 16, 2}))), - test_params_print()); +// The LikeBatchInterleaved tests must have stride >= number of transforms +INSTANTIATE_TEST_SUITE_P( + workItemStridedOOPLikeBatchInterleaved, FFTTest, + ::testing::ConvertGenerator(::testing::Combine( + oop_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(1, 10, 33), + ::testing::Values(layout_params{{8}, {33}, {99}, 1, 3}, layout_params{{8}, {33}, {2}, 1, 16}, + layout_params{{8}, {2}, {66}, 16, 2}))), + test_params_print()); INSTANTIATE_TEST_SUITE_P(workItemStridedIP, FFTTest, ::testing::ConvertGenerator(::testing::Combine( ip_unpacked_unpacked_layout, both_directions, complex_storages, From f8a8661520105d38262d149a3686e87054807ba7 Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Tue, 6 Feb 2024 13:09:05 +0000 Subject: [PATCH 05/13] remove layout from workitem-dispatcher --- src/portfft/common/global.hpp | 8 +- .../dispatcher/workitem_dispatcher.hpp | 211 +++++++++--------- 2 files changed, 108 insertions(+), 111 deletions(-) diff --git a/src/portfft/common/global.hpp b/src/portfft/common/global.hpp index 5c4574df..dc0b96ce 100644 --- a/src/portfft/common/global.hpp +++ b/src/portfft/common/global.hpp @@ -151,10 +151,10 @@ PORTFFT_INLINE void dispatch_level(const Scalar* input, Scalar* output, const Sc IdxGlobal outer_batch_offset = get_outer_batch_offset(factors, inner_batches, inclusive_scan, num_factors, level_num, iter_value, outer_batch_product, storage); if (level == detail::level::WORKITEM) { - workitem_impl( - input + outer_batch_offset, output + outer_batch_offset, input_imag + outer_batch_offset, - output_imag + outer_batch_offset, input_loc, batch_size, global_data, kh, static_cast(nullptr), - store_modifier_data, static_cast(nullptr), store_modifier_loc); + workitem_impl(input + outer_batch_offset, output + outer_batch_offset, + input_imag + outer_batch_offset, output_imag + outer_batch_offset, input_loc, + batch_size, global_data, kh, static_cast(nullptr), + store_modifier_data, static_cast(nullptr), store_modifier_loc); } else if (level == detail::level::SUBGROUP) { subgroup_impl( input + outer_batch_offset, output + outer_batch_offset, input_imag + outer_batch_offset, diff --git a/src/portfft/dispatcher/workitem_dispatcher.hpp b/src/portfft/dispatcher/workitem_dispatcher.hpp index fa6f6261..53f2dd0e 100644 --- a/src/portfft/dispatcher/workitem_dispatcher.hpp +++ b/src/portfft/dispatcher/workitem_dispatcher.hpp @@ -77,8 +77,6 @@ PORTFFT_INLINE void apply_modifier(Idx num_elements, PrivT priv, const T* modifi /** * Implementation of FFT for sizes that can be done by independent work items. * - * @tparam LayoutIn Input Layout - * @tparam LayoutOut Output Layout * @tparam SubgroupSize size of the subgroup * @tparam T type of the scalar used for computations * @param input pointer to global memory containing input data. If complex storage (from @@ -98,7 +96,7 @@ PORTFFT_INLINE void apply_modifier(Idx num_elements, PrivT priv, const T* modifi * @param loc_load_modifier Pointer to load modifier data in local memory * @param loc_store_modifier Pointer to store modifier data in local memory */ -template +template PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag, T* output_imag, T* loc, IdxGlobal n_transforms, global_data_struct<1> global_data, sycl::kernel_handler& kh, const T* load_modifier_data = nullptr, const T* store_modifier_data = nullptr, @@ -118,6 +116,11 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag const IdxGlobal input_distance = kh.get_specialization_constant(); const IdxGlobal output_distance = kh.get_specialization_constant(); + const bool is_packed_input = input_stride == 1 && input_distance == fft_size; + const bool interleaved_transforms_input = input_distance < input_stride; + const bool is_packed_output = output_stride == 1 && output_distance == fft_size; + const bool interleaved_transforms_output = output_distance < output_stride; + global_data.log_message_global(__func__, "entered", "fft_size", fft_size, "n_transforms", n_transforms); bool interleaved_storage = storage == complex_storage::INTERLEAVED_COMPLEX; @@ -158,54 +161,50 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag // This is checking for LayoutIn is PACKED or UNPACKED but we don't actually ever launch kernels with LayoutIn // as UNPACKED. - if (LayoutIn == detail::layout::PACKED || LayoutIn == detail::layout::UNPACKED) { + if (is_packed_input) { // copy into local memory cooperatively as a subgroup, allowing coalesced memory access for when elements of a // single FFT are sequential. BATCH_INTERLEAVED skips this step and loads straight from global to registers since // the sequential work-items already access sequential elements. - if (input_stride == 1 && input_distance == fft_size) { - if (storage == complex_storage::INTERLEAVED_COMPLEX) { - global_data.log_message_global(__func__, "loading packed data from global to local memory"); - global2local(global_data, input, loc_view, n_reals * n_working, global_offset, - local_offset); - } else { - global_data.log_message_global(__func__, "loading packed real data from global to local memory"); - global2local(global_data, input, loc_view, fft_size * n_working, global_offset, - local_offset); - global_data.log_message_global(__func__, "loading packed imaginary data from global to local memory"); - global2local(global_data, input_imag, loc_view, fft_size * n_working, - global_offset, local_offset + local_imag_offset); - } + if (storage == complex_storage::INTERLEAVED_COMPLEX) { + global_data.log_message_global(__func__, "loading packed data from global to local memory"); + global2local(global_data, input, loc_view, n_reals * n_working, global_offset, + local_offset); } else { - if (storage == complex_storage::INTERLEAVED_COMPLEX) { - // NOTE this is potentially less optimal when input_distance < input_stride. In that case, the data is not - // read is order of memory address and will potentially not be as coalesed as possible. For input_distance < - // input_stride, we could also skip loading to local memory, since each work-item will be only loading it's - // own data anyway. - std::array global_strides{input_distance * 2, input_stride * 2, 1}; - std::array local_strides{fft_size * 2, 2, 1}; - std::array copy_indices{n_working, fft_size, 2}; - - detail::md_view global_input_view{input, global_strides, global_input_offset}; - detail::md_view local_input_view{loc_view, local_strides, local_offset}; - - global_data.log_message_global(__func__, "loading unpacked data from global to local memory"); - copy_group(global_data, global_input_view, local_input_view, copy_indices); - } else { - std::array global_strides{input_distance, input_stride}; - std::array local_strides{fft_size, 1}; - std::array copy_indices{n_working, fft_size}; - - detail::md_view global_input_real_view{input, global_strides, global_input_offset}; - detail::md_view local_input_real_view{loc_view, local_strides, local_offset}; - detail::md_view global_input_imag_view{input_imag, global_strides, global_input_offset}; - detail::md_view local_input_imag_view{loc_view, local_strides, local_offset + local_imag_offset}; - - global_data.log_message_global(__func__, "loading unpacked real data from global to local memory"); - copy_group(global_data, global_input_real_view, local_input_real_view, copy_indices); - global_data.log_message_global(__func__, "loading unpacked imaginary data from global to local memory"); - copy_group(global_data, global_input_imag_view, local_input_imag_view, copy_indices); - } + global_data.log_message_global(__func__, "loading packed real data from global to local memory"); + global2local(global_data, input, loc_view, fft_size * n_working, global_offset, + local_offset); + global_data.log_message_global(__func__, "loading packed imaginary data from global to local memory"); + global2local(global_data, input_imag, loc_view, fft_size * n_working, + global_offset, local_offset + local_imag_offset); + } + } else if (!interleaved_transforms_input) { + if (storage == complex_storage::INTERLEAVED_COMPLEX) { + std::array global_strides{input_distance * 2, input_stride * 2, 1}; + std::array local_strides{fft_size * 2, 2, 1}; + std::array copy_indices{n_working, fft_size, 2}; + + detail::md_view global_input_view{input, global_strides, global_input_offset}; + detail::md_view local_input_view{loc_view, local_strides, local_offset}; + + global_data.log_message_global(__func__, "loading unpacked data from global to local memory"); + copy_group(global_data, global_input_view, local_input_view, copy_indices); + } else { + std::array global_strides{input_distance, input_stride}; + std::array local_strides{fft_size, 1}; + std::array copy_indices{n_working, fft_size}; + + detail::md_view global_input_real_view{input, global_strides, global_input_offset}; + detail::md_view local_input_real_view{loc_view, local_strides, local_offset}; + detail::md_view global_input_imag_view{input_imag, global_strides, global_input_offset}; + detail::md_view local_input_imag_view{loc_view, local_strides, local_offset + local_imag_offset}; + + global_data.log_message_global(__func__, "loading unpacked real data from global to local memory"); + copy_group(global_data, global_input_real_view, local_input_real_view, copy_indices); + global_data.log_message_global(__func__, "loading unpacked imaginary data from global to local memory"); + copy_group(global_data, global_input_imag_view, local_input_imag_view, copy_indices); } + } + if (is_packed_input || !interleaved_transforms_input) { #ifdef PORTFFT_LOG_DUMPS sycl::group_barrier(global_data.sg); #endif @@ -215,16 +214,16 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag sycl::group_barrier(global_data.sg); if (working) { - if (LayoutIn == detail::layout::BATCH_INTERLEAVED) { + if (interleaved_transforms_input) { global_data.log_message_global(__func__, "loading transposed data from global to private memory"); - // Load directly into registers from global memory as all loads will be fully coalesced. + // Load directly into registers from global memory so work-items read from nearby memory addresses. // No need of going through local memory either as it is an unnecessary extra write step. if (storage == complex_storage::INTERLEAVED_COMPLEX) { - detail::strided_view input_view{input, n_transforms, i * 2}; + detail::strided_view input_view{input, input_stride, input_distance * i * 2}; copy_wi<2>(global_data, input_view, priv, fft_size); } else { - detail::strided_view input_real_view{input, n_transforms, i}; - detail::strided_view input_imag_view{input_imag, n_transforms, i}; + detail::strided_view input_real_view{input, input_stride, input_distance * i}; + detail::strided_view input_imag_view{input_imag, input_stride, input_distance * i}; detail::strided_view priv_real_view{priv, 2}; detail::strided_view priv_imag_view{priv, 2, 1}; copy_wi(global_data, input_real_view, priv_real_view, fft_size); @@ -277,7 +276,19 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag } global_data.log_dump_private("data in registers after scaling:", priv, n_reals); - if (LayoutOut == detail::layout::PACKED || LayoutOut == detail::layout::UNPACKED) { + if (interleaved_transforms_output) { + if (storage == complex_storage::INTERLEAVED_COMPLEX) { + detail::strided_view output_view{output, output_stride, output_distance * i * 2}; + copy_wi<2>(global_data, priv, output_view, fft_size); + } else { + detail::strided_view priv_real_view{priv, 2}; + detail::strided_view priv_imag_view{priv, 2, 1}; + detail::strided_view output_real_view{output, output_stride, output_distance * i}; + detail::strided_view output_imag_view{output_imag, output_stride, output_distance * i}; + copy_wi(global_data, priv_real_view, output_real_view, fft_size); + copy_wi(global_data, priv_imag_view, output_imag_view, fft_size); + } + } else { global_data.log_message_global(__func__, "loading data from private to local memory"); if (storage == complex_storage::INTERLEAVED_COMPLEX) { detail::offset_view offset_local_view{loc_view, local_offset + subgroup_local_id * n_reals}; @@ -291,63 +302,49 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag copy_wi(global_data, priv_real_view, local_real_view, fft_size); copy_wi(global_data, priv_imag_view, local_imag_view, fft_size); } - } else { - if (storage == complex_storage::INTERLEAVED_COMPLEX) { - detail::strided_view output_view{output, n_transforms, i * 2}; - copy_wi<2>(global_data, priv, output_view, fft_size); - } else { - detail::strided_view priv_real_view{priv, 2}; - detail::strided_view priv_imag_view{priv, 2, 1}; - detail::strided_view output_real_view{output, n_transforms, i}; - detail::strided_view output_imag_view{output_imag, n_transforms, i}; - copy_wi(global_data, priv_real_view, output_real_view, fft_size); - copy_wi(global_data, priv_imag_view, output_imag_view, fft_size); - } } } - // This is checking for LayoutOut is PACKED or UNPACKED but we don't actually ever launch kernels with LayoutOut - // as UNPACKED, but just use PACKED and check the stride. - if (LayoutOut == detail::layout::PACKED || LayoutOut == detail::layout::UNPACKED) { + if (is_packed_output) { sycl::group_barrier(global_data.sg); global_data.log_dump_local("computed data local memory:", loc, n_reals * n_working); - if (output_stride == 1 && output_distance == fft_size) { - if (storage == complex_storage::INTERLEAVED_COMPLEX) { - global_data.log_message_global(__func__, "storing data from local to packed global memory"); - local2global(global_data, loc_view, output, n_reals * n_working, local_offset, - global_offset); - } else { - global_data.log_message_global(__func__, "storing real data from local to packed global memory"); - local2global(global_data, loc_view, output, fft_size * n_working, local_offset, - global_offset); - global_data.log_message_global(__func__, "storing imaginary data from local to packed global memory"); - local2global(global_data, loc_view, output_imag, fft_size * n_working, - local_offset + local_imag_offset, global_output_offset); - } + if (storage == complex_storage::INTERLEAVED_COMPLEX) { + global_data.log_message_global(__func__, "storing data from local to packed global memory"); + local2global(global_data, loc_view, output, n_reals * n_working, local_offset, + global_offset); } else { - if (storage == complex_storage::INTERLEAVED_COMPLEX) { - std::array global_strides{output_distance * 2, output_stride * 2, 1}; - std::array local_strides{fft_size * 2, 2, 1}; - std::array copy_indices{n_working, fft_size, 2}; - - detail::md_view global_output_view{output, global_strides, global_output_offset}; - detail::md_view local_output_view{loc_view, local_strides, local_offset}; - global_data.log_message_global(__func__, "storing data from local to unpacked global memory"); - copy_group(global_data, local_output_view, global_output_view, copy_indices); - } else { - std::array global_strides{output_distance, output_stride}; - std::array local_strides{fft_size, 1}; - std::array copy_indices{n_working, fft_size}; - - detail::md_view global_output_real_view{output, global_strides, global_output_offset}; - detail::md_view local_output_real_view{loc_view, local_strides, local_offset}; - detail::md_view global_output_imag_view{output_imag, global_strides, global_output_offset}; - detail::md_view local_output_imag_view{loc_view, local_strides, local_offset + local_imag_offset}; - global_data.log_message_global(__func__, "storing real data from local to unpacked global memory"); - copy_group(global_data, local_output_real_view, global_output_real_view, copy_indices); - global_data.log_message_global(__func__, "storing imaginary data from local to unpacked global memory"); - copy_group(global_data, local_output_imag_view, global_output_imag_view, copy_indices); - } + global_data.log_message_global(__func__, "storing real data from local to packed global memory"); + local2global(global_data, loc_view, output, fft_size * n_working, local_offset, + global_offset); + global_data.log_message_global(__func__, "storing imaginary data from local to packed global memory"); + local2global(global_data, loc_view, output_imag, fft_size * n_working, + local_offset + local_imag_offset, global_output_offset); + } + } else if (!interleaved_transforms_output) { + if (storage == complex_storage::INTERLEAVED_COMPLEX) { + std::array global_strides{output_distance * 2, output_stride * 2, 1}; + std::array local_strides{fft_size * 2, 2, 1}; + std::array copy_indices{n_working, fft_size, 2}; + + detail::md_view global_output_view{output, global_strides, global_output_offset}; + detail::md_view local_output_view{loc_view, local_strides, local_offset}; + global_data.log_message_global(__func__, "storing data from local to unpacked global memory"); + copy_group(global_data, local_output_view, global_output_view, copy_indices); + } else { + std::array global_strides{output_distance, output_stride}; + std::array local_strides{fft_size, 1}; + std::array copy_indices{n_working, fft_size}; + + detail::md_view global_output_real_view{output, global_strides, global_output_offset}; + detail::md_view local_output_real_view{loc_view, local_strides, local_offset}; + detail::md_view global_output_imag_view{output_imag, global_strides, global_output_offset}; + detail::md_view local_output_imag_view{loc_view, local_strides, local_offset + local_imag_offset}; + global_data.log_message_global(__func__, "storing real data from local to unpacked global memory"); + copy_group(global_data, local_output_real_view, global_output_real_view, copy_indices); + global_data.log_message_global(__func__, "storing imaginary data from local to unpacked global memory"); + copy_group(global_data, local_output_imag_view, global_output_imag_view, copy_indices); } + } + if (is_packed_output || !interleaved_transforms_output) { sycl::group_barrier(global_data.sg); } } @@ -400,10 +397,10 @@ struct committed_descriptor_impl::run_kernel_struct( - &in_acc_or_usm[0] + input_offset, &out_acc_or_usm[0] + output_offset, - &in_imag_acc_or_usm[0] + input_offset, &out_imag_acc_or_usm[0] + output_offset, &loc[0], n_transforms, - global_data, kh); + detail::workitem_impl(&in_acc_or_usm[0] + input_offset, &out_acc_or_usm[0] + output_offset, + &in_imag_acc_or_usm[0] + input_offset, + &out_imag_acc_or_usm[0] + output_offset, &loc[0], n_transforms, + global_data, kh); global_data.log_message_global("Exiting workitem kernel"); }); }); From dee36f59096caa0830a422091fb5c09d8f8e7639 Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Tue, 6 Feb 2024 14:04:44 +0000 Subject: [PATCH 06/13] update comments --- src/portfft/dispatcher/workitem_dispatcher.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/portfft/dispatcher/workitem_dispatcher.hpp b/src/portfft/dispatcher/workitem_dispatcher.hpp index 53f2dd0e..9ab4145f 100644 --- a/src/portfft/dispatcher/workitem_dispatcher.hpp +++ b/src/portfft/dispatcher/workitem_dispatcher.hpp @@ -159,12 +159,10 @@ PORTFFT_INLINE void workitem_impl(const T* input, T* output, const T* input_imag IdxGlobal global_input_offset = static_cast(input_distance_in_reals) * leader_i; IdxGlobal global_output_offset = static_cast(output_distance_in_reals) * leader_i; - // This is checking for LayoutIn is PACKED or UNPACKED but we don't actually ever launch kernels with LayoutIn - // as UNPACKED. if (is_packed_input) { // copy into local memory cooperatively as a subgroup, allowing coalesced memory access for when elements of a - // single FFT are sequential. BATCH_INTERLEAVED skips this step and loads straight from global to registers since - // the sequential work-items already access sequential elements. + // single FFT are sequential. When distance < stride, skip this step and load straight from global to registers + // since the sequential work-items already access sequential elements. if (storage == complex_storage::INTERLEAVED_COMPLEX) { global_data.log_message_global(__func__, "loading packed data from global to local memory"); global2local(global_data, input, loc_view, n_reals * n_working, global_offset, From 3f12710323eb6a6a52d207a85da97d6af35d0467 Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Tue, 6 Feb 2024 14:46:29 +0000 Subject: [PATCH 07/13] clarify distance for 1d kernel launch --- src/portfft/committed_descriptor_impl.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/portfft/committed_descriptor_impl.hpp b/src/portfft/committed_descriptor_impl.hpp index 5e916f39..a98bc752 100644 --- a/src/portfft/committed_descriptor_impl.hpp +++ b/src/portfft/committed_descriptor_impl.hpp @@ -963,11 +963,13 @@ class committed_descriptor_impl { std::size_t outer_size = total_size / params.lengths.back(); std::size_t input_stride_0 = input_strides.back(); std::size_t output_stride_0 = output_strides.back(); + std::size_t input_distance_0 = n_dimensions > 1 ? params.lengths.back() : input_distance; + std::size_t output_distance_0 = n_dimensions > 1 ? params.lengths.back() : output_distance; PORTFFT_LOG_TRACE("Dispatching the kernel for the last dimension"); sycl::event previous_event = dispatch_kernel_1d(in, out, in_imag, out_imag, dependencies, params.number_of_transforms * outer_size, - input_stride_0, output_stride_0, input_distance / outer_size, output_distance / outer_size, + input_stride_0, output_stride_0, input_distance_0, output_distance_0, input_offset, output_offset, dimensions.back(), compute_direction); if (n_dimensions == 1) { return previous_event; @@ -1087,19 +1089,18 @@ class committed_descriptor_impl { } } - // UNPACKED layout is also being dispatched as PACKED layout - const bool is_in_place = in == out; + // UNPACKED is also being dispatched as PACKED, but kernels that support packed don't use the layout template parameter. if (!input_batch_interleaved && !output_batch_interleaved) { return run_kernel( in, out, in_imag, out_imag, dependencies, n_transforms, input_offset, output_offset, dimension_data, compute_direction); } - if (input_batch_interleaved && !output_batch_interleaved && !is_in_place) { + if (input_batch_interleaved && !output_batch_interleaved && in != out) { return run_kernel( in, out, in_imag, out_imag, dependencies, n_transforms, input_offset, output_offset, dimension_data, compute_direction); } - if (!input_batch_interleaved && output_batch_interleaved && !is_in_place) { + if (!input_batch_interleaved && output_batch_interleaved && in != out) { return run_kernel( in, out, in_imag, out_imag, dependencies, n_transforms, input_offset, output_offset, dimension_data, compute_direction); From fd9d7ccc5d6ec37fe673210c00a58ec5a159b7ff Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Tue, 6 Feb 2024 14:52:03 +0000 Subject: [PATCH 08/13] format --- src/portfft/committed_descriptor_impl.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/portfft/committed_descriptor_impl.hpp b/src/portfft/committed_descriptor_impl.hpp index a98bc752..23c69af9 100644 --- a/src/portfft/committed_descriptor_impl.hpp +++ b/src/portfft/committed_descriptor_impl.hpp @@ -969,8 +969,8 @@ class committed_descriptor_impl { PORTFFT_LOG_TRACE("Dispatching the kernel for the last dimension"); sycl::event previous_event = dispatch_kernel_1d(in, out, in_imag, out_imag, dependencies, params.number_of_transforms * outer_size, - input_stride_0, output_stride_0, input_distance_0, output_distance_0, - input_offset, output_offset, dimensions.back(), compute_direction); + input_stride_0, output_stride_0, input_distance_0, output_distance_0, input_offset, + output_offset, dimensions.back(), compute_direction); if (n_dimensions == 1) { return previous_event; } @@ -1089,7 +1089,8 @@ class committed_descriptor_impl { } } - // UNPACKED is also being dispatched as PACKED, but kernels that support packed don't use the layout template parameter. + // UNPACKED is also being dispatched as PACKED, but kernels that support packed don't use the layout template + // parameter. if (!input_batch_interleaved && !output_batch_interleaved) { return run_kernel( in, out, in_imag, out_imag, dependencies, n_transforms, input_offset, output_offset, dimension_data, From 13d0b07cb4cc614f9dfd75dcb5e11bd1cfe4ef1e Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Wed, 7 Feb 2024 10:26:05 +0000 Subject: [PATCH 09/13] add check that strided ffts fit in workitem --- README.md | 1 + src/portfft/descriptor_validate.hpp | 48 ++++++++++++++++------------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index f9377401..9e9c684e 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,7 @@ portFFT is still in early development. The supported configurations are: * size in each dimension must be supported by 1D transforms * Arbitrary forward and backward scales * Arbitrary forward and backward offsets +* Arbitrary strides and distance where the problem size + auxilary data fits in the registers of a single work-item. Any 1D arbitrarily large input size that fits in global memory is supported, with a restriction that large input sizes should not have large prime factors. The largest prime factor depend on the device and the values set by `PORTFFT_REGISTERS_PER_WI` and `PORTFFT_SUBGROUP_SIZES`. diff --git a/src/portfft/descriptor_validate.hpp b/src/portfft/descriptor_validate.hpp index 3979885d..a69f118d 100644 --- a/src/portfft/descriptor_validate.hpp +++ b/src/portfft/descriptor_validate.hpp @@ -24,17 +24,16 @@ #include #include "common/exceptions.hpp" +#include "common/workitem.hpp" #include "enums.hpp" #include "utils.hpp" namespace portfft::detail::validate { -namespace detail { - /** * Throw an exception if the lengths are invalid when looked at in isolation. * - * @param lengths the dimensions of the tranform + * @param lengths the dimensions of the transform */ inline void validate_lengths(const std::vector& lengths) { if (lengths.empty()) { @@ -50,10 +49,12 @@ inline void validate_lengths(const std::vector& lengths) { /** * Throw an exception if the layout is unsupported. * - * @param lengths the dimensions of the tranform + * @tparam Scalar the scalar type for the transform + * @param lengths the dimensions of the transform * @param forward_layout the layout of the forward domain * @param backward_layout the layout of the backward domain */ +template inline void validate_layout(const std::vector& lengths, portfft::detail::layout forward_layout, portfft::detail::layout backward_layout) { if (lengths.size() > 1) { @@ -63,13 +64,20 @@ inline void validate_layout(const std::vector& lengths, portfft::de throw unsupported_configuration("Multi-dimensional transforms are only supported with default data layout"); } } + if (forward_layout == portfft::detail::layout::UNPACKED || backward_layout == portfft::detail::layout::UNPACKED) { + if (!portfft::detail::fits_in_wi(lengths.back())) { + throw unsupported_configuration( + "Arbitrary strides and distances are only supported for sizes that fit in the registers of a single " + "work-item"); + } + } } /** * Throw an exception if individual stride, distance and number_of_transforms values are invalid/inconsistent. * - * @param lengths the dimensions of the tranform - * @param number_of_tranforms the number of batches + * @param lengths the dimensions of the transform + * @param number_of_transforms the number of batches * @param strides the strides between elements in a domain * @param distance the distance between batches in a domain * @param domain_str a string with the name of the domain being validated @@ -99,8 +107,8 @@ inline void validate_strides_distance_basic(const std::vector& leng * For multidimensional transforms, check that the strides are large enough so there will not be overlap within a single * batch. Throw when the strides are not big enough. This accounts for layouts like batch interleaved. * - * @param lengths the dimensions of the tranform - * @param number_of_tranforms the number of batches + * @param lengths the dimensions of the transform + * @param number_of_transforms the number of batches * @param strides the strides between elements in a domain * @param distance the distance between batches in a domain * @param domain_str a string with the name of the domain being validated @@ -138,8 +146,8 @@ inline void strides_distance_multidim_check(const std::vector& leng /** * Check that batches of 1D FFTs don't overlap. * - * @param lengths the dimensions of the tranform - * @param number_of_tranforms the number of batches + * @param lengths the dimensions of the transform + * @param number_of_transforms the number of batches * @param strides the strides between elements in a domain * @param distance the distance between batches in a domain * @param domain_str a string with the name of the domain being validated @@ -190,8 +198,8 @@ inline void strides_distance_1d_check(const std::vector& lengths, s /** * Throw an exception if the given strides and distance are invalid for a single domain. * - * @param lengths the dimensions of the tranform - * @param number_of_tranforms the number of batches + * @param lengths the dimensions of the transform + * @param number_of_transforms the number of batches * @param strides the strides between elements in a domain * @param distance the distance between batches in a domain * @param domain_str a string with the name of the domain being validated @@ -211,8 +219,8 @@ inline void strides_distance_check(const std::vector& lengths, std: * Throw an exception if the given strides and distances are invalid for either domain. * * @param place where the result is written with respect to where it is read (in-place vs not in-place) - * @param lengths the dimensions of the tranform - * @param number_of_tranforms the number of batches + * @param lengths the dimensions of the transform + * @param number_of_transforms the number of batches * @param forward_strides the strides between elements in the forward domain * @param backward_strides the strides between elements in the backward domain * @param forward_distance the distance between batches in the forward domain @@ -235,7 +243,6 @@ inline void validate_strides_distance(placement place, const std::vector(params.lengths, portfft::detail::get_layout(params, direction::FORWARD), + portfft::detail::get_layout(params, direction::BACKWARD)); } } // namespace portfft::detail::validate From 39101bc0679a056f386f972d9c5f1b4a532d91be Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Wed, 7 Feb 2024 16:44:32 +0000 Subject: [PATCH 10/13] clarified README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9e9c684e..fa0b3cb4 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,8 @@ Any batch size is supported as long as the input and output data fits in global By default the library assumes subgroup size of 32 is used. If that is not supported by the device it is running on, the subgroup size can be set using `PORTFFT_SUBGROUP_SIZES`. +Configurations that attempt to read from the same memory address from two separate batches of a transform are not supported. + ## Known issues * portFFT relies on SYCL specialization constants which have some limitations currently: From 3672ad563da0c6940ab52fd4c0d248bedf403e78 Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Thu, 8 Feb 2024 09:53:53 +0000 Subject: [PATCH 11/13] clarify the use of stride and distance in dispatch_kernel_1d --- src/portfft/committed_descriptor_impl.hpp | 81 +++++++++-------------- 1 file changed, 31 insertions(+), 50 deletions(-) diff --git a/src/portfft/committed_descriptor_impl.hpp b/src/portfft/committed_descriptor_impl.hpp index 23c69af9..1366597c 100644 --- a/src/portfft/committed_descriptor_impl.hpp +++ b/src/portfft/committed_descriptor_impl.hpp @@ -903,13 +903,11 @@ class committed_descriptor_impl { "INTERLEAVED_COMPLEX."); } if (compute_direction == direction::FORWARD) { - return dispatch_dimensions(in, out, in_imag, out_imag, dependencies, params.forward_strides, - params.backward_strides, params.forward_distance, params.backward_distance, - params.forward_offset, params.backward_offset, compute_direction); + return dispatch_dimensions(in, out, in_imag, out_imag, dependencies, params.forward_offset, + params.backward_offset, compute_direction); } - return dispatch_dimensions(in, out, in_imag, out_imag, dependencies, params.backward_strides, - params.forward_strides, params.backward_distance, params.forward_distance, - params.backward_offset, params.forward_offset, compute_direction); + return dispatch_dimensions(in, out, in_imag, out_imag, dependencies, params.backward_offset, params.forward_offset, + compute_direction); } /** @@ -926,10 +924,6 @@ class committed_descriptor_impl { * @param out_imag buffer or USM pointer to memory containing imaginary part of the output data. Ignored if * `descriptor.complex_storage` is interleaved. * @param dependencies events that must complete before the computation - * @param input_strides strides between input elements for each dimension of one FFT - * @param output_strides strides between output elements for each dimension of one FFT - * @param input_distance distance between the starts of input data for two consecutive FFTs - * @param output_distance distance between the starts of output data for two consecutive FFTs * @param input_offset offset into input allocation where the data for FFTs start * @param output_offset offset into output allocation where the data for FFTs start * @param compute_direction direction of compute, forward / backward @@ -937,22 +931,18 @@ class committed_descriptor_impl { */ template sycl::event dispatch_dimensions(const TIn& in, TOut& out, const TIn& in_imag, TOut& out_imag, - const std::vector& dependencies, - const std::vector& input_strides, - const std::vector& output_strides, std::size_t input_distance, - std::size_t output_distance, std::size_t input_offset, std::size_t output_offset, - direction compute_direction) { + const std::vector& dependencies, std::size_t input_offset, + std::size_t output_offset, direction compute_direction) { PORTFFT_LOG_FUNCTION_ENTRY(); using TOutConst = std::conditional_t, const std::remove_pointer_t*, const TOut>; std::size_t n_dimensions = params.lengths.size(); std::size_t total_size = params.get_flattened_length(); - const auto forward_layout = detail::get_layout(params, direction::FORWARD); - const auto backward_layout = detail::get_layout(params, direction::BACKWARD); + const auto input_layout = detail::get_layout(params, compute_direction); + const auto output_layout = detail::get_layout(params, inv(compute_direction)); // currently multi-dimensional transforms are implemented just for default (PACKED) data layout - const bool multi_dim_supported = - forward_layout == detail::layout::PACKED && backward_layout == detail::layout::PACKED; + const bool multi_dim_supported = input_layout == detail::layout::PACKED && output_layout == detail::layout::PACKED; if (n_dimensions != 1 && !multi_dim_supported) { throw internal_error("Only default layout is supported for multi-dimensional transforms."); } @@ -961,16 +951,11 @@ class committed_descriptor_impl { std::size_t inner_size = 1; // product of sizes of all dimension outer relative to the one we are currently working on std::size_t outer_size = total_size / params.lengths.back(); - std::size_t input_stride_0 = input_strides.back(); - std::size_t output_stride_0 = output_strides.back(); - std::size_t input_distance_0 = n_dimensions > 1 ? params.lengths.back() : input_distance; - std::size_t output_distance_0 = n_dimensions > 1 ? params.lengths.back() : output_distance; PORTFFT_LOG_TRACE("Dispatching the kernel for the last dimension"); - sycl::event previous_event = - dispatch_kernel_1d(in, out, in_imag, out_imag, dependencies, params.number_of_transforms * outer_size, - input_stride_0, output_stride_0, input_distance_0, output_distance_0, input_offset, - output_offset, dimensions.back(), compute_direction); + sycl::event previous_event = dispatch_kernel_1d( + in, out, in_imag, out_imag, dependencies, params.number_of_transforms * outer_size, input_layout, output_layout, + input_offset, output_offset, dimensions.back(), compute_direction); if (n_dimensions == 1) { return previous_event; } @@ -985,9 +970,9 @@ class committed_descriptor_impl { PORTFFT_LOG_TRACE("Dispatching the kernels for the dimension", i); for (std::size_t j = 0; j < params.number_of_transforms * outer_size; j++) { sycl::event e = dispatch_kernel_1d( - out, out, out_imag, out_imag, previous_events, inner_size, inner_size, inner_size, 1, 1, - output_offset + j * stride_between_kernels, output_offset + j * stride_between_kernels, dimensions[i], - compute_direction); + out, out, out_imag, out_imag, previous_events, inner_size, layout::BATCH_INTERLEAVED, + layout::BATCH_INTERLEAVED, output_offset + j * stride_between_kernels, + output_offset + j * stride_between_kernels, dimensions[i], compute_direction); next_events.push_back(e); } inner_size *= params.lengths[i]; @@ -1012,10 +997,8 @@ class committed_descriptor_impl { * `descriptor.complex_storage` is interleaved. * @param dependencies events that must complete before the computation * @param n_transforms number of FT transforms to do in one call - * @param input_stride stride between input elements of one FFT - * @param output_stride stride between output elements of one FFT - * @param input_distance distance between the starts of input data for two consecutive FFTs - * @param output_distance distance between the starts of output data for two consecutive FFTs + * @param input_layout the layout of the input data of the transforms + * @param output_layout the layout of the output data of the transforms * @param input_offset offset into input allocation where the data for FFTs start * @param output_offset offset into output allocation where the data for FFTs start * @param dimension_data data for the dimension this call will work on @@ -1025,13 +1008,13 @@ class committed_descriptor_impl { template sycl::event dispatch_kernel_1d(const TIn& in, TOut& out, const TIn& in_imag, TOut& out_imag, const std::vector& dependencies, std::size_t n_transforms, - std::size_t input_stride, std::size_t output_stride, std::size_t input_distance, - std::size_t output_distance, std::size_t input_offset, std::size_t output_offset, - dimension_struct& dimension_data, direction compute_direction) { + layout input_layout, layout output_layout, std::size_t input_offset, + std::size_t output_offset, dimension_struct& dimension_data, + direction compute_direction) { PORTFFT_LOG_FUNCTION_ENTRY(); return dispatch_kernel_1d_helper( - in, out, in_imag, out_imag, dependencies, n_transforms, input_stride, output_stride, input_distance, - output_distance, input_offset, output_offset, dimension_data, compute_direction); + in, out, in_imag, out_imag, dependencies, n_transforms, input_layout, output_layout, input_offset, + output_offset, dimension_data, compute_direction); } /** @@ -1051,10 +1034,8 @@ class committed_descriptor_impl { * `descriptor.complex_storage` is interleaved. * @param dependencies events that must complete before the computation * @param n_transforms number of FT transforms to do in one call - * @param input_stride stride between input elements of one FFT - * @param output_stride stride between output elements of one FFT - * @param input_distance distance between the starts of input data for two consecutive FFTs - * @param output_distance distance between the starts of output data for two consecutive FFTs + * @param input_layout the layout of the input data of the transforms + * @param output_layout the layout of the output data of the transforms * @param input_offset offset into input allocation where the data for FFTs start * @param output_offset offset into output allocation where the data for FFTs start * @param dimension_data data for the dimension this call will work on @@ -1064,14 +1045,14 @@ class committed_descriptor_impl { template sycl::event dispatch_kernel_1d_helper(const TIn& in, TOut& out, const TIn& in_imag, TOut& out_imag, const std::vector& dependencies, std::size_t n_transforms, - std::size_t input_stride, std::size_t output_stride, std::size_t input_distance, - std::size_t output_distance, std::size_t input_offset, + layout input_layout, layout output_layout, std::size_t input_offset, std::size_t output_offset, dimension_struct& dimension_data, direction compute_direction) { PORTFFT_LOG_FUNCTION_ENTRY(); if (SubgroupSize == dimension_data.used_sg_size) { - const bool input_batch_interleaved = input_distance == 1 && input_stride == n_transforms; - const bool output_batch_interleaved = output_distance == 1 && output_stride == n_transforms; + const bool input_batch_interleaved = input_layout == layout::BATCH_INTERLEAVED; + const bool output_batch_interleaved = output_layout == layout::BATCH_INTERLEAVED; + for (kernel_data_struct kernel_data : dimension_data.forward_kernels) { std::size_t minimum_local_mem_required; if (input_batch_interleaved) { @@ -1089,7 +1070,7 @@ class committed_descriptor_impl { } } - // UNPACKED is also being dispatched as PACKED, but kernels that support packed don't use the layout template + // UNPACKED is also being dispatched as PACKED, but kernels that support UNPACKED don't use the layout template // parameter. if (!input_batch_interleaved && !output_batch_interleaved) { return run_kernel( @@ -1117,8 +1098,8 @@ class committed_descriptor_impl { throw invalid_configuration("None of the compiled subgroup sizes are supported by the device!"); } else { return dispatch_kernel_1d_helper( - in, out, in_imag, out_imag, dependencies, n_transforms, input_stride, output_stride, input_distance, - output_distance, input_offset, output_offset, dimension_data, compute_direction); + in, out, in_imag, out_imag, dependencies, n_transforms, input_layout, output_layout, input_offset, + output_offset, dimension_data, compute_direction); } } From c30f03068b9bdbfc8217642d2cbb1530bdaac25f Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Thu, 8 Feb 2024 12:15:46 +0000 Subject: [PATCH 12/13] added shortcut validation for batch_interleaved --- src/portfft/descriptor_validate.hpp | 3 ++- test/unit_test/instantiate_fft_tests.hpp | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/portfft/descriptor_validate.hpp b/src/portfft/descriptor_validate.hpp index a69f118d..26e9dfc9 100644 --- a/src/portfft/descriptor_validate.hpp +++ b/src/portfft/descriptor_validate.hpp @@ -170,7 +170,8 @@ inline void strides_distance_1d_check(const std::vector& lengths, s const std::size_t stride = strides[0]; const std::size_t first_batch_limit = stride * fft_size; - if (first_batch_limit <= distance) { + const std::size_t first_length_limit = distance * number_of_transforms; + if ((stride <= distance && first_batch_limit <= distance) || (distance <= stride && first_length_limit <= stride)) { return; } diff --git a/test/unit_test/instantiate_fft_tests.hpp b/test/unit_test/instantiate_fft_tests.hpp index 41601257..94c74130 100644 --- a/test/unit_test/instantiate_fft_tests.hpp +++ b/test/unit_test/instantiate_fft_tests.hpp @@ -261,6 +261,13 @@ INSTANTIATE_TEST_SUITE_P( ip_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(1, 3, 33), ::testing::Values(layout_params{{3}, {66}, {66}, 2, 2}, layout_params{{6}, {40}, {40}, 1, 1}))), test_params_print()); +// these layouts are only valid because there is only a single batch +INSTANTIATE_TEST_SUITE_P(WorkItemStridedStrideEqualsDistance, FFTTest, + ::testing::ConvertGenerator(::testing::Combine( + all_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(1), + ::testing::Values(layout_params{{8}, {2}, {2}, 2, 2}, + layout_params{{8}, {1}, {1}, 1, 1}))), + test_params_print()); // clang-format off // Arbitrary interleaved FFT test suites @@ -322,6 +329,12 @@ INSTANTIATE_TEST_SUITE_P(InvalidOverlapLarge, InvalidFFTTest, ::testing::Values(3333334), ::testing::Values(layout_params{{8}, {3333333}, {3333333}, 1, 1}))), test_params_print()); +INSTANTIATE_TEST_SUITE_P(InvalidStrideEqualsDistance, InvalidFFTTest, + ::testing::ConvertGenerator(::testing::Combine( + oop_unpacked_unpacked_layout, both_directions, complex_storages, ::testing::Values(2), + ::testing::Values(layout_params{{8}, {2}, {2}, 2, 2}, + layout_params{{8}, {1}, {1}, 1, 1}))), + test_params_print()); #define INSTANTIATE_TESTS_FULL(TYPE, MEMORY) \ TEST_P(FFTTest, TYPE##_##MEMORY##_C2C) { \ From c1afe0b5cc328b5383a42eaa87d29fcd7b4a9e9e Mon Sep 17 00:00:00 2001 From: Finlay Marno Date: Thu, 8 Feb 2024 12:22:21 +0000 Subject: [PATCH 13/13] rename descriptor_validate to descriptor_validation --- src/portfft/descriptor.hpp | 2 +- .../{descriptor_validate.hpp => descriptor_validation.hpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/portfft/{descriptor_validate.hpp => descriptor_validation.hpp} (100%) diff --git a/src/portfft/descriptor.hpp b/src/portfft/descriptor.hpp index c885ed7a..2e0025d1 100644 --- a/src/portfft/descriptor.hpp +++ b/src/portfft/descriptor.hpp @@ -29,7 +29,7 @@ #include "committed_descriptor.hpp" #include "defines.hpp" -#include "descriptor_validate.hpp" +#include "descriptor_validation.hpp" #include "enums.hpp" namespace portfft { diff --git a/src/portfft/descriptor_validate.hpp b/src/portfft/descriptor_validation.hpp similarity index 100% rename from src/portfft/descriptor_validate.hpp rename to src/portfft/descriptor_validation.hpp