Merge branch 'main' into penghuic/pytest_by_script
PenghuiCheng authored Dec 11, 2024
2 parents f496cb0 + 66ac930 commit ebc83be
Showing 7 changed files with 132 additions and 99 deletions.
57 changes: 57 additions & 0 deletions src/ATen/native/xpu/DilatedMaxPool2d.cpp
@@ -4,6 +4,7 @@
#include <ATen/native/xpu/sycl/DilatedMaxPool2d.h>
#include <comm/RegisterUtils.h>

+#include <xpu/ATen/ops/max.h>
#include <xpu/ATen/ops/max_pool2d_with_indices_backward_native.h>
#include <xpu/ATen/ops/max_pool2d_with_indices_native.h>

@@ -40,6 +41,62 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_xpu)
bool ceil_mode,
const Tensor& output,
const Tensor& indices) {
const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
const int kW = kernel_size.size() == 1
? kH
: safe_downcast<int, int64_t>(kernel_size[1]);
const int padH = safe_downcast<int, int64_t>(padding[0]);
const int padW =
padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);

const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
const int64_t nInputPlane = input.size(-3);
const int64_t inputHeight = input.size(-2);
const int64_t inputWidth = input.size(-1);

const int64_t outputHeight = output.size(-2);
const int64_t outputWidth = output.size(-1);
if (outputHeight == 1 && outputWidth == 1 && inputHeight <= kH &&
inputWidth <= kW && padH == 0 && padW == 0) {
auto smf = input.suggest_memory_format();
Tensor input_ = input.contiguous(smf);
bool is_3d = input.ndimension() == 3;
Tensor indices_, output_;
if (is_3d) {
indices_ = indices.contiguous();
output_ = output.contiguous();
} else {
indices_ = indices.contiguous(smf);
output_ = output.contiguous(smf);
}
if (!is_3d) {
input_.resize_({nbatch, nInputPlane, 1, inputHeight * inputWidth}, smf);
output_.resize_(
{nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
indices_.resize_(
{nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
at::max_outf(input_, 3, true, output_, indices_);
} else {
at::max_outf(input_, 2, true, output_, indices_);
}

if (!is_3d) {
input_.resize_({nbatch, nInputPlane, inputHeight, inputWidth}, smf);
output_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
indices_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
}

if ((is_3d && !indices.is_contiguous()) ||
(!is_3d && !indices.is_contiguous(smf))) {
indices.copy_(indices_);
}

if ((is_3d && !output.is_contiguous()) ||
(!is_3d && !output.is_contiguous(smf))) {
output.copy_(output_);
}
return;
}
xpu::max_pool2d_with_indices_kernel(
input,
kernel_size,
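The branch added above short-circuits the global-pooling case: when the output is 1x1, the kernel window covers the whole input and there is no padding, the op flattens the spatial dimensions and reuses at::max_outf instead of launching the pooling kernel. Below is a minimal sketch (not part of this commit) of a call that would take this path, assuming a libtorch build with the XPU backend available:

#include <ATen/ATen.h>

int main() {
  // NCHW input whose 7x7 kernel covers the full spatial extent, no padding:
  // the result is a 1x1 output per plane, i.e. a max over H*W.
  at::Tensor x = at::randn({8, 64, 7, 7}, at::kXPU);
  auto [out, idx] = at::max_pool2d_with_indices(
      x, /*kernel_size=*/{7, 7}, /*stride=*/{7, 7});
  // out: [8, 64, 1, 1]; idx holds the flattened argmax within each plane,
  // which is what the at::max_outf reduction over the flattened dim yields.
  return 0;
}
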
6 changes: 3 additions & 3 deletions src/ATen/native/xpu/RreluWithNoise.cpp
@@ -6,7 +6,7 @@ namespace native {

Tensor& rrelu_with_noise_out_xpu(
const Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
@@ -18,7 +18,7 @@ Tensor& rrelu_with_noise_out_xpu(

Tensor rrelu_with_noise_xpu(
const Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
@@ -30,7 +30,7 @@ Tensor rrelu_with_noise_xpu(

Tensor& rrelu_with_noise_xpu_(
Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
141 changes: 51 additions & 90 deletions src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp
@@ -8,7 +8,6 @@
#include <ATen/Dispatch.h>
#include <ATen/native/Pool.h>
#include <ATen/native/utils/ParamUtils.h>
-#include <xpu/ATen/ops/max.h>

#include <ATen/native/xpu/sycl/Atomics.h>
#include <ATen/native/xpu/sycl/BatchKernel.h>
@@ -542,96 +541,58 @@ void max_pool2d_with_indices_kernel(

const int64_t outputHeight = output.size(-2);
const int64_t outputWidth = output.size(-1);
if (outputHeight == 1 && outputWidth == 1 && inputHeight <= kH &&
inputWidth <= kW && padH == 0 && padW == 0) {
bool is_3d = input_.ndimension() == 3;
Tensor indices_, output_;
if (is_3d) {
indices_ = indices.contiguous();
output_ = output.contiguous();
} else {
indices_ = indices.contiguous(smf);
output_ = output.contiguous(smf);
}
if (!is_3d) {
input.resize_({nbatch, nInputPlane, 1, inputHeight * inputWidth}, smf);
output_.resize_(
{nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
indices_.resize_(
{nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
at::max_outf(input, 3, true, output_, indices_);
} else {
at::max_outf(input, 2, true, output_, indices_);
}

if (!is_3d) {
input.resize_({nbatch, nInputPlane, inputHeight, inputWidth}, smf);
output_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
indices_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
}

if ((is_3d && !indices.is_contiguous()) ||
(!is_3d && !indices.is_contiguous(smf))) {
indices.copy_(indices_);
}

if ((is_3d && !output.is_contiguous()) ||
(!is_3d && !output.is_contiguous(smf))) {
output.copy_(output_);
}
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "max_pool2d_xpu", [&] {
switch (smf) {
case MemoryFormat::ChannelsLast: {
launch_max_pool2d_kernel<scalar_t, true>(
output.mutable_data_ptr<scalar_t>(),
indices.mutable_data_ptr<int64_t>(),
input.const_data_ptr<scalar_t>(),
nbatch,
nInputPlane,
inputHeight,
inputWidth,
outputHeight,
outputWidth,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW);
break;
}
case MemoryFormat::Contiguous: {
launch_max_pool2d_kernel<scalar_t, false>(
output.mutable_data_ptr<scalar_t>(),
indices.mutable_data_ptr<int64_t>(),
input.const_data_ptr<scalar_t>(),
nbatch,
nInputPlane,
inputHeight,
inputWidth,
outputHeight,
outputWidth,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW);
break;
}
default:
TORCH_CHECK(
false,
"Unsupported memory format. Supports only ChannelsLast, Contiguous");

AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "max_pool2d_xpu", [&] {
switch (smf) {
case MemoryFormat::ChannelsLast: {
launch_max_pool2d_kernel<scalar_t, true>(
output.mutable_data_ptr<scalar_t>(),
indices.mutable_data_ptr<int64_t>(),
input.const_data_ptr<scalar_t>(),
nbatch,
nInputPlane,
inputHeight,
inputWidth,
outputHeight,
outputWidth,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW);
break;
}
});
}
case MemoryFormat::Contiguous: {
launch_max_pool2d_kernel<scalar_t, false>(
output.mutable_data_ptr<scalar_t>(),
indices.mutable_data_ptr<int64_t>(),
input.const_data_ptr<scalar_t>(),
nbatch,
nInputPlane,
inputHeight,
inputWidth,
outputHeight,
outputWidth,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW);
break;
}
default:
TORCH_CHECK(
false,
"Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
});
}

void max_pool2d_with_indices_backward_kernel(
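With the special case hoisted into the structured op above, the kernel body reduces to the memory-format dispatch shown here. As a rough illustration (not part of this commit), a channels-last input would select the <scalar_t, true> instantiation, assuming an XPU device is available:

#include <ATen/ATen.h>

int main() {
  // suggest_memory_format() reports ChannelsLast for this tensor, so the
  // switch above picks launch_max_pool2d_kernel<scalar_t, true>.
  at::Tensor x = at::randn({4, 3, 32, 32}, at::kXPU)
                     .contiguous(at::MemoryFormat::ChannelsLast);
  auto [out, idx] = at::max_pool2d_with_indices(
      x, /*kernel_size=*/{3, 3}, /*stride=*/{2, 2}, /*padding=*/{1, 1});
  return 0;
}
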
14 changes: 14 additions & 0 deletions src/ATen/native/xpu/sycl/LerpKernels.cpp
@@ -57,15 +57,29 @@ struct LerpScalarFunctor {
opmath_t weight_val_;
};

void lerp_scalar_kernel(
at::TensorIteratorBase& iter,
const c10::Scalar& weight);

void lerp_tensor_kernel(at::TensorIteratorBase& iter) {
auto dtype = iter.common_dtype();
if (at::isComplexType(dtype)) {
AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_xpu", [&] {
if (iter.is_cpu_scalar(3)) {
auto weight_val = iter.scalar_value<scalar_t>(3);
iter.remove_operand(3);
return lerp_scalar_kernel(iter, weight_val);
}
gpu_kernel(iter, LerpTensorComplexFunctor<scalar_t>());
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "lerp_xpu", [&] {
if (iter.is_cpu_scalar(3)) {
auto weight_val = iter.scalar_value<scalar_t>(3);
iter.remove_operand(3);
return lerp_scalar_kernel(iter, weight_val);
}
gpu_kernel(iter, LerpTensorFunctor<scalar_t>());
});
}
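The two added branches check whether operand 3 of the TensorIterator (the weight) is a CPU scalar and, if so, strip it from the iterator and forward to lerp_scalar_kernel rather than running the tensor-weight functor. A rough sketch (not from this commit) of a call that produces that operand layout, assuming an XPU build:

#include <ATen/ATen.h>

int main() {
  at::Tensor a = at::rand({1024}, at::kXPU);
  at::Tensor b = at::rand({1024}, at::kXPU);
  // A 0-dim CPU tensor is treated as a CPU scalar by the TensorIterator,
  // so the new is_cpu_scalar(3) check reroutes to the scalar kernel.
  at::Tensor w = at::scalar_tensor(0.25, at::kFloat);
  at::Tensor y = at::lerp(a, b, w);  // same result as at::lerp(a, b, 0.25)
  return 0;
}
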
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp
@@ -86,7 +86,7 @@ template <typename scalar_t>
inline void _rrelu_with_noise_xpu_train(
Tensor& output,
const Tensor& input_,
-const Tensor& noise_,
+Tensor& noise_,
const Scalar& lower_,
const Scalar& upper_,
std::optional<Generator> generator) {
@@ -153,7 +153,7 @@ inline void _rrelu_with_noise_xpu_train(

Tensor& rrelu_with_noise_kernel(
const Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h
@@ -7,7 +7,7 @@ namespace at::native::xpu {

TORCH_XPU_API Tensor& rrelu_with_noise_kernel(
const Tensor& self,
-const Tensor& noise,
+Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
7 changes: 4 additions & 3 deletions yaml/native/native_functions.yaml
@@ -8184,25 +8184,26 @@
variants: function
tags: pointwise

-- func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: rrelu_with_noise.out(Tensor self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn
tags: nondeterministic_seeded
dispatch:
XPU: rrelu_with_noise_out_xpu

-- func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
+- func: rrelu_with_noise(Tensor self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
python_module: nn
dispatch:
XPU: rrelu_with_noise_xpu
tags: nondeterministic_seeded
autogen: rrelu_with_noise_functional

- func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor
python_module: nn
dispatch:
CompositeExplicitAutograd: rrelu_with_noise_backward
autogen: rrelu_with_noise_backward.out

-- func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
+- func: rrelu_with_noise_(Tensor(a!) self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
python_module: nn
tags: nondeterministic_seeded
dispatch:
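The Tensor(b!) annotation added above marks noise as mutable: in training mode the op writes the per-element slope it sampled into noise, which rrelu_with_noise_backward later reads. A rough usage sketch (not from this commit), assuming an XPU device is available:

#include <ATen/ATen.h>

int main() {
  at::Tensor x = at::randn({16}, at::kXPU);
  at::Tensor noise = at::empty_like(x);
  // training=true: negative elements are scaled by a slope drawn uniformly
  // between lower and upper, and that slope is written back into `noise`.
  at::Tensor y = at::rrelu_with_noise(
      x, noise, /*lower=*/0.125, /*upper=*/1.0 / 3.0, /*training=*/true);
  return 0;
}
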
