diff --git a/src/aten/RangeFactories.cpp b/src/aten/RangeFactories.cpp
new file mode 100644
index 000000000..16aa31a8a
--- /dev/null
+++ b/src/aten/RangeFactories.cpp
@@ -0,0 +1,20 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace at {
+
+Tensor& XPUNativeFunctions::arange_out(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& out) {
+  return at::native::xpu::arange_kernel(start, end, step, out);
+}
+
+} // namespace at
diff --git a/src/aten/sycl/RangeFactoriesKernel.cpp b/src/aten/sycl/RangeFactoriesKernel.cpp
new file mode 100644
index 000000000..0cecbdb36
--- /dev/null
+++ b/src/aten/sycl/RangeFactoriesKernel.cpp
@@ -0,0 +1,162 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace at {
+namespace native {
+namespace xpu {
+
+constexpr int nitem_per_wg = 256;
+constexpr int item_work_size = 1;
+constexpr int group_work_size = item_work_size * nitem_per_wg;
+
+template <typename func_t, typename index_t>
+struct ElementwiseKernelWithIndexFunctor {
+  using res_t = typename function_traits<func_t>::result_type;
+  void operator()(sycl::nd_item<1> item) const {
+#pragma unroll
+    for (int i = 0; i < item_work_size; i++) {
+      index_t idx = group_work_size * item.get_group(0) + nitem_per_wg * i +
+          item.get_local_id(0);
+      if (idx < N_) {
+        data_[idx] = f_(idx);
+      }
+    }
+  }
+  ElementwiseKernelWithIndexFunctor(index_t N, func_t f, res_t* data)
+      : N_(N), f_(f), data_(data) {}
+
+ private:
+  index_t N_;
+  func_t f_;
+  res_t* data_;
+};
+
+template <typename func_t>
+void gpu_kernel_with_index(at::Tensor& output, func_t f) {
+  int64_t N = output.numel();
+  if (N == 0) {
+    return;
+  }
+  int64_t num_wg = (N + group_work_size - 1) / group_work_size;
+  auto queue = at::xpu::getCurrentSYCLQueue();
+  using scalar_t = typename function_traits<func_t>::result_type;
+  if (N <= std::numeric_limits<int>::max()) {
+    auto caller = ElementwiseKernelWithIndexFunctor<func_t, int>(
+        N, f, output.mutable_data_ptr<scalar_t>());
+    sycl_kernel_submit(num_wg * nitem_per_wg, nitem_per_wg, queue, caller);
+  } else {
+    auto caller = ElementwiseKernelWithIndexFunctor<func_t, int64_t>(
+        N, f, output.mutable_data_ptr<scalar_t>());
+    sycl_kernel_submit(num_wg * nitem_per_wg, nitem_per_wg, queue, caller);
+  }
+}
+
+template <typename scalar_t, typename accscalar_t>
+struct ArangeFunctor {
+  scalar_t operator()(int64_t ind) const {
+    accscalar_t inc = xstep_ * static_cast<accscalar_t>(ind);
+    accscalar_t val = xstart_ + inc;
+    return static_cast<scalar_t>(val);
+  }
+  ArangeFunctor(accscalar_t xstart, accscalar_t xstep)
+      : xstart_(xstart), xstep_(xstep) {}
+
+ private:
+  accscalar_t xstart_;
+  accscalar_t xstep_;
+};
+
+Tensor& arange_kernel(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& result) {
+  AT_DISPATCH_ALL_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      result.scalar_type(),
+      "arange_xpu",
+      [&]() {
+        using accscalar_t = at::acc_type<scalar_t, true>;
+        auto xstart = start.to<accscalar_t>();
+        auto xend = end.to<accscalar_t>();
+        auto xstep = step.to<accscalar_t>();
+
+        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+        TORCH_CHECK(
+            std::isfinite(static_cast<double>(xstart)) &&
+                std::isfinite(static_cast<double>(xend)),
+            "unsupported range: ",
+            xstart,
+            " -> ",
+            xend);
+        TORCH_CHECK(
+            ((xstep > 0) && (xend >= xstart)) ||
+                ((xstep < 0) && (xend <= xstart)),
+            "upper bound and larger bound inconsistent with step sign");
+
+        // We use double precision for (start - end) / step to compute
+        // size_d for consistency across devices. The problem with using
+        // accscalar_t is that accscalar_t might be float32 on GPU for a
+        // float32 scalar_t, but double on CPU for the same, and the
+        // effective output size starts differing on CPU vs GPU because
+        // of precision issues, which we don't want. The corner case we
+        // do want to take into account is int64_t, which has higher
+        // precision than double.
+        double size_d;
+        if constexpr (std::is_same_v<scalar_t, int64_t>) {
+          int64_t sgn = (xstep > 0) - (xstep < 0);
+          size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
+        } else {
+          size_d = std::ceil(
+              static_cast<double>(end.to<double>() - start.to<double>()) /
+              step.to<double>());
+        }
+
+        TORCH_CHECK(
+            size_d >= 0 &&
+                size_d <=
+                    static_cast<double>(std::numeric_limits<int64_t>::max()),
+            "invalid size, possible overflow?");
+        int64_t size = static_cast<int64_t>(size_d);
+        int64_t numel = result.numel();
+
+        if (numel != size) {
+          if (numel > 0) {
+            TORCH_WARN(
+                "The number of elements in the out tensor of shape ",
+                result.sizes(),
+                " is ",
+                numel,
+                " which does not match the computed number of elements ",
+                size,
+                ". Note that this may occur as a result of rounding error. "
+                "The out tensor will be resized to a tensor of shape (",
+                size,
+                ",).");
+          }
+          result.resize_({size});
+        }
+        bool is_contiguous = result.is_contiguous();
+        Tensor r = !is_contiguous
+            ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
+            : result;
+
+        auto f = ArangeFunctor<scalar_t, accscalar_t>(xstart, xstep);
+        gpu_kernel_with_index(r, f);
+
+        if (!is_contiguous) {
+          result.copy_(r);
+        }
+      });
+
+  return result;
+}
+
+} // namespace xpu
+} // namespace native
+} // namespace at
diff --git a/src/aten/sycl/RangeFactoriesKernel.h b/src/aten/sycl/RangeFactoriesKernel.h
new file mode 100644
index 000000000..26ca6197d
--- /dev/null
+++ b/src/aten/sycl/RangeFactoriesKernel.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include
+
+namespace at {
+namespace native {
+namespace xpu {
+
+Tensor& arange_kernel(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& result);
+
+} // namespace xpu
+} // namespace native
+} // namespace at
diff --git a/test/xpu/test_ops.py b/test/xpu/test_ops.py
index f40dc4fe0..315dc4d5c 100644
--- a/test/xpu/test_ops.py
+++ b/test/xpu/test_ops.py
@@ -70,6 +70,7 @@
     "nn.functional.threshold",
     "nn.functional.relu",
     "nn.functional.gelu",
+    "arange",
 ]
 _xpu_tensor_factory_op_list = [
     "normal",
diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 06d3441c3..f84c249d3 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -96,6 +96,7 @@ supported:
   - gelu.out
   - gelu_backward
   - gelu_backward.grad_input
+  - arange.start_out
   - abs
   - abs_
   - abs.out
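As background for the size computation in arange_kernel above: for floating-point dtypes the element count is ceil((end - start) / step) evaluated in double precision so CPU and XPU agree on the output size, while for int64_t an exact integer formula is used because int64_t values can exceed double's 53-bit mantissa. Below is a minimal standalone sketch of that logic only; the helper names arange_numel_fp and arange_numel_i64 are illustrative and do not appear in the patch.

#include <cmath>
#include <cstdint>
#include <iostream>
#include <stdexcept>

// Element count for floating-point arange: computed in double precision so
// that the size matches the CPU implementation.
int64_t arange_numel_fp(double start, double end, double step) {
  if (step == 0.0) throw std::invalid_argument("step must be nonzero");
  double size_d = std::ceil((end - start) / step);
  if (size_d < 0) throw std::invalid_argument("bounds inconsistent with step sign");
  return static_cast<int64_t>(size_d);
}

// Element count for int64_t arange: pure integer arithmetic, equivalent to
// ceil((end - start) / step) without a round trip through double.
int64_t arange_numel_i64(int64_t start, int64_t end, int64_t step) {
  if (step == 0) throw std::invalid_argument("step must be nonzero");
  int64_t sgn = (step > 0) - (step < 0);
  return (end - start + step - sgn) / step;
}

int main() {
  std::cout << arange_numel_fp(0.0, 1.0, 0.3) << "\n"; // 4 -> {0.0, 0.3, 0.6, 0.9}
  std::cout << arange_numel_i64(0, 10, 3) << "\n";     // 4 -> {0, 3, 6, 9}
}

Both calls print 4, which matches the number of elements produced by torch.arange(0, 1, 0.3) and torch.arange(0, 10, 3).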