Merge branch 'main' into penghuic/pytest_by_script
PenghuiCheng authored Dec 11, 2024
2 parents f496cb0 + 66ac930 commit ebc83be
Showing 7 changed files with 132 additions and 99 deletions.
57 changes: 57 additions & 0 deletions src/ATen/native/xpu/DilatedMaxPool2d.cpp
@@ -4,6 +4,7 @@
#include <ATen/native/xpu/sycl/DilatedMaxPool2d.h>
#include <comm/RegisterUtils.h>

+#include <xpu/ATen/ops/max.h>
#include <xpu/ATen/ops/max_pool2d_with_indices_backward_native.h>
#include <xpu/ATen/ops/max_pool2d_with_indices_native.h>

@@ -40,6 +41,62 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_xpu)
bool ceil_mode,
const Tensor& output,
const Tensor& indices) {
const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
const int kW = kernel_size.size() == 1
? kH
: safe_downcast<int, int64_t>(kernel_size[1]);
const int padH = safe_downcast<int, int64_t>(padding[0]);
const int padW =
padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);

const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
const int64_t nInputPlane = input.size(-3);
const int64_t inputHeight = input.size(-2);
const int64_t inputWidth = input.size(-1);

const int64_t outputHeight = output.size(-2);
const int64_t outputWidth = output.size(-1);
if (outputHeight == 1 && outputWidth == 1 && inputHeight <= kH &&
inputWidth <= kW && padH == 0 && padW == 0) {
auto smf = input.suggest_memory_format();
Tensor input_ = input.contiguous(smf);
bool is_3d = input.ndimension() == 3;
Tensor indices_, output_;
if (is_3d) {
indices_ = indices.contiguous();
output_ = output.contiguous();
} else {
indices_ = indices.contiguous(smf);
output_ = output.contiguous(smf);
}
if (!is_3d) {
input_.resize_({nbatch, nInputPlane, 1, inputHeight * inputWidth}, smf);
output_.resize_(
{nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
indices_.resize_(
{nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
at::max_outf(input_, 3, true, output_, indices_);
} else {
at::max_outf(input_, 2, true, output_, indices_);
}

if (!is_3d) {
input_.resize_({nbatch, nInputPlane, inputHeight, inputWidth}, smf);
output_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
indices_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
}

if ((is_3d && !indices.is_contiguous()) ||
(!is_3d && !indices.is_contiguous(smf))) {
indices.copy_(indices_);
}

if ((is_3d && !output.is_contiguous()) ||
(!is_3d && !output.is_contiguous(smf))) {
output.copy_(output_);
}
return;
}
xpu::max_pool2d_with_indices_kernel(
input,
kernel_size,
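The branch added above short-circuits the global-pooling case: when the output is 1x1, the kernel window covers the whole input and there is no padding, the op flattens the spatial dimensions and reuses at::max_outf instead of launching the pooling kernel. Below is a minimal sketch (not part of this commit) of a call that would take this path, assuming a libtorch build with the XPU backend available:

#include <ATen/ATen.h>

int main() {
  // NCHW input whose 7x7 kernel covers the full spatial extent, no padding:
  // the result is a 1x1 output per plane, i.e. a max over H*W.
  at::Tensor x = at::randn({8, 64, 7, 7}, at::kXPU);
  auto [out, idx] = at::max_pool2d_with_indices(
      x, /*kernel_size=*/{7, 7}, /*stride=*/{7, 7});
  // out: [8, 64, 1, 1]; idx holds the flattened argmax within each plane,
  // which is what the at::max_outf reduction over the flattened dim yields.
  return 0;
}
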
6 changes: 3 additions & 3 deletions src/ATen/native/xpu/RreluWithNoise.cpp
@@ -6,7 +6,7 @@ namespace native {

Tensor& rrelu_with_noise_out_xpu(
const Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
@@ -18,7 +18,7 @@ Tensor& rrelu_with_noise_out_xpu(

Tensor rrelu_with_noise_xpu(
const Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
@@ -30,7 +30,7 @@ Tensor rrelu_with_noise_xpu(

Tensor& rrelu_with_noise_xpu_(
Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
141 changes: 51 additions & 90 deletions src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp
@@ -8,7 +8,6 @@
#include <ATen/Dispatch.h>
#include <ATen/native/Pool.h>
#include <ATen/native/utils/ParamUtils.h>
-#include <xpu/ATen/ops/max.h>

#include <ATen/native/xpu/sycl/Atomics.h>
#include <ATen/native/xpu/sycl/BatchKernel.h>
@@ -542,96 +541,58 @@ void max_pool2d_with_indices_kernel(

const int64_t outputHeight = output.size(-2);
const int64_t outputWidth = output.size(-1);
if (outputHeight == 1 && outputWidth == 1 && inputHeight <= kH &&
inputWidth <= kW && padH == 0 && padW == 0) {
bool is_3d = input_.ndimension() == 3;
Tensor indices_, output_;
if (is_3d) {
indices_ = indices.contiguous();
output_ = output.contiguous();
} else {
indices_ = indices.contiguous(smf);
output_ = output.contiguous(smf);
}
if (!is_3d) {
input.resize_({nbatch, nInputPlane, 1, inputHeight * inputWidth}, smf);
output_.resize_(
{nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
indices_.resize_(
{nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
at::max_outf(input, 3, true, output_, indices_);
} else {
at::max_outf(input, 2, true, output_, indices_);
}

if (!is_3d) {
input.resize_({nbatch, nInputPlane, inputHeight, inputWidth}, smf);
output_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
indices_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
}

if ((is_3d && !indices.is_contiguous()) ||
(!is_3d && !indices.is_contiguous(smf))) {
indices.copy_(indices_);
}

if ((is_3d && !output.is_contiguous()) ||
(!is_3d && !output.is_contiguous(smf))) {
output.copy_(output_);
}
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "max_pool2d_xpu", [&] {
switch (smf) {
case MemoryFormat::ChannelsLast: {
launch_max_pool2d_kernel<scalar_t, true>(
output.mutable_data_ptr<scalar_t>(),
indices.mutable_data_ptr<int64_t>(),
input.const_data_ptr<scalar_t>(),
nbatch,
nInputPlane,
inputHeight,
inputWidth,
outputHeight,
outputWidth,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW);
break;
}
case MemoryFormat::Contiguous: {
launch_max_pool2d_kernel<scalar_t, false>(
output.mutable_data_ptr<scalar_t>(),
indices.mutable_data_ptr<int64_t>(),
input.const_data_ptr<scalar_t>(),
nbatch,
nInputPlane,
inputHeight,
inputWidth,
outputHeight,
outputWidth,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW);
break;
}
default:
TORCH_CHECK(
false,
"Unsupported memory format. Supports only ChannelsLast, Contiguous");

AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "max_pool2d_xpu", [&] {
switch (smf) {
case MemoryFormat::ChannelsLast: {
launch_max_pool2d_kernel<scalar_t, true>(
output.mutable_data_ptr<scalar_t>(),
indices.mutable_data_ptr<int64_t>(),
input.const_data_ptr<scalar_t>(),
nbatch,
nInputPlane,
inputHeight,
inputWidth,
outputHeight,
outputWidth,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW);
break;
}
});
}
case MemoryFormat::Contiguous: {
launch_max_pool2d_kernel<scalar_t, false>(
output.mutable_data_ptr<scalar_t>(),
indices.mutable_data_ptr<int64_t>(),
input.const_data_ptr<scalar_t>(),
nbatch,
nInputPlane,
inputHeight,
inputWidth,
outputHeight,
outputWidth,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW);
break;
}
default:
TORCH_CHECK(
false,
"Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
});
}

void max_pool2d_with_indices_backward_kernel(
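With the special case hoisted into the structured op above, the kernel body reduces to the memory-format dispatch shown here. As a rough illustration (not part of this commit), a channels-last input would select the <scalar_t, true> instantiation, assuming an XPU device is available:

#include <ATen/ATen.h>

int main() {
  // suggest_memory_format() reports ChannelsLast for this tensor, so the
  // switch above picks launch_max_pool2d_kernel<scalar_t, true>.
  at::Tensor x = at::randn({4, 3, 32, 32}, at::kXPU)
                     .contiguous(at::MemoryFormat::ChannelsLast);
  auto [out, idx] = at::max_pool2d_with_indices(
      x, /*kernel_size=*/{3, 3}, /*stride=*/{2, 2}, /*padding=*/{1, 1});
  return 0;
}
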
14 changes: 14 additions & 0 deletions src/ATen/native/xpu/sycl/LerpKernels.cpp
@@ -57,15 +57,29 @@ struct LerpScalarFunctor {
opmath_t weight_val_;
};

void lerp_scalar_kernel(
at::TensorIteratorBase& iter,
const c10::Scalar& weight);

void lerp_tensor_kernel(at::TensorIteratorBase& iter) {
auto dtype = iter.common_dtype();
if (at::isComplexType(dtype)) {
AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_xpu", [&] {
if (iter.is_cpu_scalar(3)) {
auto weight_val = iter.scalar_value<scalar_t>(3);
iter.remove_operand(3);
return lerp_scalar_kernel(iter, weight_val);
}
gpu_kernel(iter, LerpTensorComplexFunctor<scalar_t>());
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "lerp_xpu", [&] {
if (iter.is_cpu_scalar(3)) {
auto weight_val = iter.scalar_value<scalar_t>(3);
iter.remove_operand(3);
return lerp_scalar_kernel(iter, weight_val);
}
gpu_kernel(iter, LerpTensorFunctor<scalar_t>());
});
}
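The two added branches check whether operand 3 of the TensorIterator (the weight) is a CPU scalar and, if so, strip it from the iterator and forward to lerp_scalar_kernel rather than running the tensor-weight functor. A rough sketch (not from this commit) of a call that produces that operand layout, assuming an XPU build:

#include <ATen/ATen.h>

int main() {
  at::Tensor a = at::rand({1024}, at::kXPU);
  at::Tensor b = at::rand({1024}, at::kXPU);
  // A 0-dim CPU tensor is treated as a CPU scalar by the TensorIterator,
  // so the new is_cpu_scalar(3) check reroutes to the scalar kernel.
  at::Tensor w = at::scalar_tensor(0.25, at::kFloat);
  at::Tensor y = at::lerp(a, b, w);  // same result as at::lerp(a, b, 0.25)
  return 0;
}
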
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp
@@ -86,7 +86,7 @@ template <typename scalar_t>
inline void _rrelu_with_noise_xpu_train(
Tensor& output,
const Tensor& input_,
-const Tensor& noise_,
+Tensor& noise_,
const Scalar& lower_,
const Scalar& upper_,
std::optional<Generator> generator) {
@@ -153,7 +153,7 @@ inline void _rrelu_with_noise_xpu_train(

Tensor& rrelu_with_noise_kernel(
const Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h
@@ -7,7 +7,7 @@ namespace at::native::xpu {

TORCH_XPU_API Tensor& rrelu_with_noise_kernel(
const Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
7 changes: 4 additions & 3 deletions yaml/native/native_functions.yaml
@@ -8184,25 +8184,26 @@
variants: function
tags: pointwise

-- func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: rrelu_with_noise.out(Tensor self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn
tags: nondeterministic_seeded
dispatch:
XPU: rrelu_with_noise_out_xpu

-- func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
+- func: rrelu_with_noise(Tensor self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
python_module: nn
dispatch:
XPU: rrelu_with_noise_xpu
tags: nondeterministic_seeded
autogen: rrelu_with_noise_functional

- func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor
python_module: nn
dispatch:
CompositeExplicitAutograd: rrelu_with_noise_backward
autogen: rrelu_with_noise_backward.out

-- func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
+- func: rrelu_with_noise_(Tensor(a!) self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
python_module: nn
tags: nondeterministic_seeded
dispatch:
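The Tensor(b!) annotation added above marks noise as mutable: in training mode the op writes the per-element slope it sampled into noise, which rrelu_with_noise_backward later reads. A rough usage sketch (not from this commit), assuming an XPU device is available:

#include <ATen/ATen.h>

int main() {
  at::Tensor x = at::randn({16}, at::kXPU);
  at::Tensor noise = at::empty_like(x);
  // training=true: negative elements are scaled by a slope drawn uniformly
  // between lower and upper, and that slope is written back into `noise`.
  at::Tensor y = at::rrelu_with_noise(
      x, noise, /*lower=*/0.125, /*upper=*/1.0 / 3.0, /*training=*/true);
  return 0;
}
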
