Commit ca71ab0
Signed-off-by: Feng Yuan <[email protected]>
Co-authored-by: Feng Yuan <[email protected]>
1 parent: 0e1beb7
Showing 5 changed files with 201 additions and 0 deletions.
@@ -0,0 +1,20 @@
#include <ATen/ATen.h>
#include <ATen/ExpandUtils.h>
#include <ATen/ScalarOps.h>
#include <ATen/XPUNativeFunctions.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/TensorIterator.h>
#include <aten/sycl/RangeFactoriesKernel.h>
#include <torch/library.h>

namespace at {

// Op-level entry point: forwards directly to the SYCL kernel implementation.
Tensor& XPUNativeFunctions::arange_out(
    const Scalar& start,
    const Scalar& end,
    const Scalar& step,
    Tensor& out) {
  return at::native::xpu::arange_kernel(start, end, step, out);
}

} // namespace at
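
For orientation, here is a minimal sketch of how this entry point is reached from the ATen C++ API. It is not part of the commit, and it assumes a torch-xpu-ops build with an XPU device available:

#include <ATen/ATen.h>

int main() {
  // Assumes an XPU device is present; the factory call below will throw
  // if no XPU backend is registered.
  auto opts = at::TensorOptions().device(at::kXPU).dtype(at::kFloat);
  // Dispatches through XPUNativeFunctions::arange_out into arange_kernel.
  at::Tensor t = at::arange(/*start=*/0, /*end=*/10, /*step=*/2, opts);
  // t now holds [0., 2., 4., 6., 8.] on the XPU device.
  return 0;
}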
@@ -0,0 +1,162 @@
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/core/Tensor.h>
#include <ATen/detail/FunctionTraits.h>
#include <comm/SYCLContext.h>

#include <cmath>
#include <limits>

namespace at {
namespace native {
namespace xpu {

// Launch geometry: 256 work-items per work-group, one output element per item.
constexpr int nitem_per_wg = 256;
constexpr int item_work_size = 1;
constexpr int group_work_size = item_work_size * nitem_per_wg;

template <typename index_t, typename func_t>
struct ElementwiseKernelWithIndexFunctor {
  using res_t = typename function_traits<func_t>::result_type;
  void operator()(sycl::nd_item<1> item) const {
#pragma unroll
    for (int i = 0; i < item_work_size; i++) {
      // Global linear index of the element this work-item computes.
      index_t idx = group_work_size * item.get_group(0) + nitem_per_wg * i +
          item.get_local_id(0);
      if (idx < N_) {
        data_[idx] = f_(idx);
      }
    }
  }
  ElementwiseKernelWithIndexFunctor(index_t N, func_t f, res_t* data)
      : N_(N), f_(f), data_(data) {}

 private:
  index_t N_;
  func_t f_;
  res_t* data_;
};

template <typename func_t>
void gpu_kernel_with_index(at::Tensor& output, func_t f) {
  int64_t N = output.numel();
  if (N == 0) {
    return;
  }
  int64_t num_wg = (N + group_work_size - 1) / group_work_size;
  auto queue = at::xpu::getCurrentSYCLQueue();
  using scalar_t = typename function_traits<func_t>::result_type;
  // Prefer 32-bit indexing when the element count permits; it is cheaper
  // than 64-bit index arithmetic on most devices.
  if (N <= std::numeric_limits<int>::max()) {
    auto caller = ElementwiseKernelWithIndexFunctor<int, func_t>(
        N, f, output.mutable_data_ptr<scalar_t>());
    sycl_kernel_submit(num_wg * nitem_per_wg, nitem_per_wg, queue, caller);
  } else {
    auto caller = ElementwiseKernelWithIndexFunctor<int64_t, func_t>(
        N, f, output.mutable_data_ptr<scalar_t>());
    sycl_kernel_submit(num_wg * nitem_per_wg, nitem_per_wg, queue, caller);
  }
}

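As an aside, the launcher is generic over any callable that maps a flat index to a value; the result dtype is deduced via function_traits. A hypothetical functor (not part of the commit) that fills a tensor with its own indices would look like:

// Hypothetical example of the gpu_kernel_with_index contract.
struct IotaFunctor {
  int64_t operator()(int64_t idx) const {
    return idx; // element i receives value i
  }
};
// Usage sketch, assuming `opts` targets the XPU device:
//   at::Tensor out = at::empty({1024}, opts.dtype(at::kLong));
//   gpu_kernel_with_index(out, IotaFunctor{});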
template <typename scalar_t, typename accscalar_t>
struct ArangeFunctor {
  scalar_t operator()(int64_t ind) const {
    // Compute start + ind * step in the accumulation type, then narrow.
    accscalar_t inc = xstep_ * static_cast<accscalar_t>(ind);
    accscalar_t val = xstart_ + inc;
    return static_cast<scalar_t>(val);
  }
  ArangeFunctor(accscalar_t xstart, accscalar_t xstep)
      : xstart_(xstart), xstep_(xstep) {}

 private:
  accscalar_t xstart_;
  accscalar_t xstep_;
};

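// Illustrative only: with xstart = 1.0f and xstep = 0.5f, index 3 maps to
// 1.0 + 0.5 * 3 = 2.5. Accumulating in accscalar_t (float for half and
// bfloat16 inputs) limits rounding drift before the final narrowing cast.
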
Tensor& arange_kernel(
    const Scalar& start,
    const Scalar& end,
    const Scalar& step,
    Tensor& result) {
  AT_DISPATCH_ALL_TYPES_AND2(
      at::ScalarType::Half,
      at::ScalarType::BFloat16,
      result.scalar_type(),
      "arange_xpu",
      [&]() {
        using accscalar_t = at::acc_type<scalar_t, true>;
        auto xstart = start.to<accscalar_t>();
        auto xend = end.to<accscalar_t>();
        auto xstep = step.to<accscalar_t>();

        // Written as (xstep > 0 || xstep < 0) so that a NaN step fails too.
        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
        TORCH_CHECK(
            std::isfinite(static_cast<double>(xstart)) &&
                std::isfinite(static_cast<double>(xend)),
            "unsupported range: ",
            xstart,
            " -> ",
            xend);
        TORCH_CHECK(
            ((xstep > 0) && (xend >= xstart)) ||
                ((xstep < 0) && (xend <= xstart)),
            "upper bound and lower bound inconsistent with step sign");

        // We use double precision for (end - start) / step to compute
        // size_d for consistency across devices. The problem with using
        // accscalar_t is that accscalar_t might be float32 on the GPU for
        // a float32 scalar_t, but double on the CPU for the same, and the
        // effective output size would then differ between CPU and GPU due
        // to precision issues, which we don't want. The corner case we do
        // want to take into account is int64_t, which has higher precision
        // than double.
        double size_d;
        if constexpr (std::is_same_v<scalar_t, int64_t>) {
          int64_t sgn = (xstep > 0) - (xstep < 0);
          size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
        } else {
          size_d = std::ceil(
              static_cast<double>(end.to<double>() - start.to<double>()) /
              step.to<double>());
        }
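
        // Worked example (illustrative, not part of the commit):
        // arange(0, 10, 3) with a floating dtype gives
        //   size_d = ceil((10 - 0) / 3) = ceil(3.33...) = 4,
        // i.e. the elements [0, 3, 6, 9]. With scalar_t == int64_t the
        // integer branch computes (10 - 0 + 3 - 1) / 3 = 4 exactly,
        // with no floating-point rounding at the boundary.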

        TORCH_CHECK(
            size_d >= 0 &&
                size_d <=
                    static_cast<double>(std::numeric_limits<int64_t>::max()),
            "invalid size, possible overflow?");
        int64_t size = static_cast<int64_t>(size_d);
        int64_t numel = result.numel();

        if (numel != size) {
          if (numel > 0) {
            TORCH_WARN(
                "The number of elements in the out tensor of shape ",
                result.sizes(),
                " is ",
                numel,
                " which does not match the computed number of elements ",
                size,
                ". Note that this may occur as a result of rounding error. "
                "The out tensor will be resized to a tensor of shape (",
                size,
                ",).");
          }
          result.resize_({size});
        }

        // The index-based kernel assumes a dense layout, so a non-contiguous
        // result is staged through a contiguous scratch tensor and copied
        // back afterwards.
        bool is_contiguous = result.is_contiguous();
        Tensor r = !is_contiguous
            ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
            : result;

        auto f = ArangeFunctor<scalar_t, accscalar_t>(xstart, xstep);
        gpu_kernel_with_index(r, f);

        if (!is_contiguous) {
          result.copy_(r);
        }
      });

  return result;
}

} // namespace xpu
} // namespace native
} // namespace at
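
To round out the picture, a sketch of the out= path that exercises the resize logic above. Again, this is not part of the commit and assumes an available XPU device:

auto opts = at::TensorOptions().device(at::kXPU).dtype(at::kLong);
at::Tensor out = at::empty({0}, opts);
// numel (0) != computed size (5), so arange_kernel resizes out to {5}.
at::arange_out(out, /*start=*/5, /*end=*/0, /*step=*/-1);
// out now holds [5, 4, 3, 2, 1].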
@@ -0,0 +1,17 @@
#pragma once

#include <ATen/native/TensorIterator.h>

namespace at {
namespace native {
namespace xpu {

Tensor& arange_kernel(
    const Scalar& start,
    const Scalar& end,
    const Scalar& step,
    Tensor& result);

} // namespace xpu
} // namespace native
} // namespace at