Skip to content

Commit

Permalink
Complement part of activation ops (#49)
Browse files Browse the repository at this point in the history
e.g. relu, threshold, threshold_backward, gelu, gelu_backward,
tanh_backward

---------

Signed-off-by: Feng Yuan <[email protected]>
Co-authored-by: Feng Yuan <[email protected]>
  • Loading branch information
xytintel and fengyuan14 authored Mar 25, 2024
1 parent e755b1e commit 0e1beb7
Show file tree
Hide file tree
Showing 14 changed files with 495 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
*/*.pyc
*/*.so*
*/**/__pycache__
*/**/*.dylib*
*/**/*.pyc
*/**/*.pyd
*/**/*.so*
*/**/**/*.pyc
*/**/**/**/*.pyc
*/**/**/**/**/*.pyc
124 changes: 124 additions & 0 deletions src/aten/Activation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#include <ATen/ScalarOps.h>
#include <ATen/XPUNativeFunctions.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/TensorIterator.h>

#include <aten/sycl/ActivationGeluKernel.h>
#include <aten/sycl/ActivationOpsKernels.h>
#include <aten/sycl/ActivationThresholdKernel.h>

namespace at {

// Functional ReLU: returns a new tensor with negative entries of `self`
// replaced by zero (see ReluFunctor in ActivationOpsKernels.cpp).
Tensor XPUNativeFunctions::relu(const Tensor& self) {
  Tensor result;
  auto it = TensorIterator::unary_op(result, self);
  native::xpu::relu_kernel(it);
  return it.output();
}

// In-place ReLU: overwrites `self`, clamping negative entries to zero.
Tensor& XPUNativeFunctions::relu_(Tensor& self) {
  auto it = TensorIterator::unary_op(self, self);
  native::xpu::relu_kernel(it);
  return self;
}

// Out-variant of ReLU: writes the result into the caller-provided `out`.
Tensor& XPUNativeFunctions::relu_out(const Tensor& self, Tensor& out) {
  auto it = TensorIterator::unary_op(out, self);
  native::xpu::relu_kernel(it);
  return out;
}

// Functional threshold: out[i] = self[i] <= threshold ? value : self[i].
// `self` is deliberately passed as BOTH operands: the kernel's functor
// (ThresholdFunctor in ActivationThresholdKernel.cpp) compares its first
// input against `threshold` and passes its second input through otherwise.
Tensor XPUNativeFunctions::threshold(
    const Tensor& self,
    const Scalar& threshold,
    const Scalar& value) {
  Tensor out;
  auto iter = TensorIterator::binary_op(out, self, self);
  native::xpu::threshold_kernel(iter, threshold, value);
  return iter.output();
}

// In-place threshold: self[i] = self[i] <= threshold ? value : self[i].
// `self` serves as output and both inputs of the binary iterator.
Tensor& XPUNativeFunctions::threshold_(
    Tensor& self,
    const Scalar& threshold,
    const Scalar& value) {
  auto it = TensorIterator::binary_op(self, self, self);
  native::xpu::threshold_kernel(it, threshold, value);
  return self;
}

// Out-variant of threshold: writes into the caller-provided `out`.
// `self` appears as both inputs; see ThresholdFunctor for why.
Tensor& XPUNativeFunctions::threshold_out(
    const Tensor& self,
    const Scalar& threshold,
    const Scalar& value,
    Tensor& out) {
  auto it = TensorIterator::binary_op(out, self, self);
  native::xpu::threshold_kernel(it, threshold, value);
  return out;
}

// Backward of threshold:
//   grad_input[i] = self[i] <= threshold ? 0 : grad_output[i].
// Reuses the forward kernel with value = 0: elements that were clamped in
// the forward pass receive zero gradient, the rest pass grad_output through.
// Operand order matters — `self` is the compared input, `grad_output` the
// pass-through input of ThresholdFunctor.
Tensor XPUNativeFunctions::threshold_backward(
    const Tensor& grad_output,
    const Tensor& self,
    const Scalar& threshold) {
  Tensor grad_input;
  auto iter = TensorIterator::binary_op(grad_input, self, grad_output);
  native::xpu::threshold_kernel(iter, threshold, 0);
  return iter.output();
}

// Out-variant of threshold_backward; writes into the caller-provided
// `grad_input`. Same value = 0 reuse of the forward kernel as
// threshold_backward above.
Tensor& XPUNativeFunctions::threshold_backward_out(
    const Tensor& grad_output,
    const Tensor& self,
    const Scalar& threshold,
    Tensor& grad_input) {
  auto iter = TensorIterator::binary_op(grad_input, self, grad_output);
  native::xpu::threshold_kernel(iter, threshold, 0);
  return grad_input;
}

// Functional GELU; `approximate` selects the erf ("none") or tanh variant
// (dispatched inside gelu_kernel via get_gelutype_enum).
Tensor XPUNativeFunctions::gelu(
    const Tensor& self,
    c10::string_view approximate) {
  Tensor result;
  auto it = TensorIterator::unary_op(result, self);
  native::xpu::gelu_kernel(it, approximate);
  return it.output();
}

// In-place GELU applied directly onto `self`.
Tensor& XPUNativeFunctions::gelu_(Tensor& self, c10::string_view approximate) {
  auto it = TensorIterator::unary_op(self, self);
  native::xpu::gelu_kernel(it, approximate);
  return self;
}

// Out-variant of GELU: writes the result into the caller-provided `out`.
Tensor& XPUNativeFunctions::gelu_out(
    const Tensor& self,
    c10::string_view approximate,
    Tensor& out) {
  auto it = TensorIterator::unary_op(out, self);
  native::xpu::gelu_kernel(it, approximate);
  return out;
}

// Backward of GELU; `approximate` must match the value used in forward.
// Operand order matters: the backward functors take (dy, x), so
// grad_output is the first input and self the second (see
// GeluTanhBackwardFunctor / GeluErfBackwardFunctor).
Tensor XPUNativeFunctions::gelu_backward(
    const Tensor& grad_output,
    const Tensor& self,
    c10::string_view approximate) {
  Tensor grad_input;
  auto iter = TensorIterator::binary_op(grad_input, grad_output, self);
  native::xpu::gelu_backward_kernel(iter, approximate);
  return iter.output();
}

// Out-variant of gelu_backward; writes into the caller-provided
// `grad_input`. Same (grad_output, self) operand order as gelu_backward.
Tensor& XPUNativeFunctions::gelu_backward_out(
    const Tensor& grad_output,
    const Tensor& self,
    c10::string_view approximate,
    Tensor& grad_input) {
  auto iter = TensorIterator::binary_op(grad_input, grad_output, self);
  native::xpu::gelu_backward_kernel(iter, approximate);
  return grad_input;
}

} // namespace at
19 changes: 19 additions & 0 deletions src/aten/BinaryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <ATen/native/TensorIterator.h>

#include <aten/sycl/BinaryKernels.h>
#include <aten/sycl/BinaryMiscBackwardOpsKernels.h>
#include <aten/sycl/BinaryRemainderKernel.h>

namespace at {
Expand Down Expand Up @@ -331,4 +332,22 @@ Tensor& XPUNativeFunctions::fmod_out(
return XPUNativeFunctions::fmod_out(self, wrapper, out);
}

// Backward of tanh, computed from the saved forward `output`. The kernel
// consumes (grad_output, output) pairs; see tanh_backward_kernel in
// BinaryMiscBackwardOpsKernels.h.
Tensor XPUNativeFunctions::tanh_backward(
    const Tensor& grad_output,
    const Tensor& output) {
  Tensor grad_input;
  auto it = TensorIterator::binary_op(grad_input, grad_output, output);
  native::xpu::tanh_backward_kernel(it);
  return it.output();
}

// Out-variant of tanh_backward; writes into the caller-provided
// `grad_input`. Operand order (grad_output first, saved output second)
// matches the functional variant above.
Tensor& XPUNativeFunctions::tanh_backward_out(
    const Tensor& grad_output,
    const Tensor& output,
    Tensor& grad_input) {
  auto iter = TensorIterator::binary_op(grad_input, grad_output, output);
  native::xpu::tanh_backward_kernel(iter);
  return grad_input;
}

} // namespace at
123 changes: 123 additions & 0 deletions src/aten/sycl/ActivationGeluKernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/Activation.h>
#include <ATen/native/TensorIterator.h>
#include <aten/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

namespace at {
namespace native {
namespace xpu {

// Tanh-approximated GELU forward:
//   y = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// Arithmetic runs in opmath_t (float for half/bfloat16) to limit rounding
// error; the result converts back to scalar_t on return.
template <typename scalar_t>
struct GeluTanhFunctor {
  scalar_t operator()(scalar_t x) const {
    using opmath_t = at::opmath_type<scalar_t>;
    // kBeta = sqrt(2/pi): sqrt(2) * (2/sqrt(pi)) * 0.5.
    constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5);
    // Cubic-term coefficient of the tanh approximation.
    constexpr opmath_t kKappa = 0.044715;
    auto x_cube = static_cast<opmath_t>(x) * static_cast<opmath_t>(x) *
        static_cast<opmath_t>(x);
    auto inner = kBeta * (static_cast<opmath_t>(x) + kKappa * x_cube);
    return opmath_t(0.5) * static_cast<opmath_t>(x) *
        (opmath_t(1) + c10::xpu::compat::tanh(inner));
  }
};

// Gradient of the tanh-approximated GELU, evaluated as (dy, x) -> dx:
//   inner = sqrt(2/pi) * (x + 0.044715 * x^3)
//   dx = dy * (0.5 * (1 + tanh(inner))
//              + 0.5 * x * (1 - tanh(inner)^2) * d(inner)/dx)
// i.e. the product rule applied to left = 0.5 * x and
// right = 1 + tanh(inner). Arithmetic runs in opmath_t.
template <typename scalar_t>
struct GeluTanhBackwardFunctor {
  scalar_t operator()(scalar_t dy, scalar_t x) const {
    using opmath_t = at::opmath_type<scalar_t>;
    // kBeta = sqrt(2/pi); kKappa matches the forward functor.
    constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5);
    constexpr opmath_t kKappa = 0.044715;
    auto x_sq = static_cast<opmath_t>(x) * static_cast<opmath_t>(x);
    auto x_cube = x_sq * static_cast<opmath_t>(x);
    auto inner = kBeta * (static_cast<opmath_t>(x) + kKappa * x_cube);
    auto tanh_inner = c10::xpu::compat::tanh(inner);

    auto left = opmath_t(0.5) * static_cast<opmath_t>(x);
    auto right = opmath_t(1) + tanh_inner;

    // d(left)/dx * right
    auto left_derivative = opmath_t(0.5) * right;

    // left * d(right)/dx, via d(tanh u)/du = 1 - tanh(u)^2 and the chain rule.
    auto tanh_derivative = opmath_t(1) - tanh_inner * tanh_inner;
    auto inner_derivative = kBeta * (opmath_t(1) + opmath_t(3) * kKappa * x_sq);
    auto right_derivative = left * tanh_derivative * inner_derivative;

    return static_cast<opmath_t>(dy) * (left_derivative + right_derivative);
  }
};

// Exact (erf-based) GELU forward: y = x * 0.5 * (1 + erf(x / sqrt(2))).
// kAlpha = 1/sqrt(2) folds the division into a multiply.
template <typename scalar_t>
struct GeluErfFunctor {
  scalar_t operator()(scalar_t x) const {
    using opmath_t = at::opmath_type<scalar_t>;
    constexpr opmath_t kAlpha = M_SQRT1_2;
    return static_cast<opmath_t>(x) * opmath_t(0.5) *
        (opmath_t(1) + ::erf(static_cast<opmath_t>(x) * kAlpha));
  }
};

// Gradient of the exact GELU, evaluated as (dy, x) -> dx:
//   dx = dy * (Phi(x) + x * phi(x))
// where Phi is the standard normal CDF and phi its PDF.
// kBeta = (2/sqrt(pi)) * (1/sqrt(2)) * 0.5 = 1/sqrt(2*pi), the PDF's
// normalizing constant; kAlpha = 1/sqrt(2).
template <typename scalar_t>
struct GeluErfBackwardFunctor {
  scalar_t operator()(scalar_t dy, scalar_t x) const {
    using opmath_t = at::opmath_type<scalar_t>;
    constexpr opmath_t kBeta = M_2_SQRTPI * M_SQRT1_2 * opmath_t(0.5);
    constexpr opmath_t kAlpha = M_SQRT1_2;
    const opmath_t cdf = opmath_t(0.5) *
        (opmath_t(1) + ::erf(static_cast<opmath_t>(x) * kAlpha));
    const opmath_t pdf = c10::xpu::compat::exp(
                             opmath_t(-0.5) * static_cast<opmath_t>(x) *
                             static_cast<opmath_t>(x)) *
        kBeta;
    return static_cast<opmath_t>(dy) * (cdf + static_cast<opmath_t>(x) * pdf);
  }
};

// Selects the GELU forward functor from `approximate` ("tanh" vs anything
// else, i.e. the erf form) and dispatches it over floating dtypes plus
// Half/BFloat16.
void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate) {
  const auto gelu_type = at::native::get_gelutype_enum(approximate);
  if (gelu_type != at::native::GeluType::Tanh) {
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::BFloat16,
        at::ScalarType::Half,
        iter.dtype(),
        "gelu_erf_xpu",
        [&]() { gpu_kernel(iter, GeluErfFunctor<scalar_t>()); });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::BFloat16,
        at::ScalarType::Half,
        iter.dtype(),
        "gelu_tanh_xpu",
        [&]() { gpu_kernel(iter, GeluTanhFunctor<scalar_t>()); });
  }
}

// Dispatches the GELU backward functor matching `approximate` over
// floating dtypes plus Half/BFloat16. The functors consume (dy, x) pairs
// in that operand order. NOTE(review): gpu_kernel_with_scalars presumably
// also supports one operand being a scalar — confirm against Loops.h.
void gelu_backward_kernel(
    TensorIteratorBase& iter,
    c10::string_view approximate) {
  auto approximate_ = at::native::get_gelutype_enum(approximate);
  if (approximate_ == at::native::GeluType::Tanh) {
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::BFloat16,
        at::ScalarType::Half,
        iter.dtype(),
        "gelu_tanh_backward_xpu",
        [&]() {
          gpu_kernel_with_scalars(iter, GeluTanhBackwardFunctor<scalar_t>());
        });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::BFloat16,
        at::ScalarType::Half,
        iter.dtype(),
        "gelu_erf_backward_xpu",
        [&]() {
          gpu_kernel_with_scalars(iter, GeluErfBackwardFunctor<scalar_t>());
        });
  }
}

} // namespace xpu
} // namespace native
} // namespace at
17 changes: 17 additions & 0 deletions src/aten/sycl/ActivationGeluKernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

#include <ATen/native/TensorIterator.h>

namespace at {
namespace native {
namespace xpu {

// Elementwise GELU forward over `iter`; `approximate` selects the erf
// ("none") or tanh variant.
void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate);

// Elementwise GELU backward; `iter` supplies (grad_output, self) pairs and
// `approximate` must match the forward call.
void gelu_backward_kernel(
    TensorIteratorBase& iter,
    c10::string_view approximate);

} // namespace xpu
} // namespace native
} // namespace at
28 changes: 28 additions & 0 deletions src/aten/sycl/ActivationOpsKernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/Activation.h>
#include <ATen/native/TensorIterator.h>
#include <aten/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

namespace at {
namespace native {
namespace xpu {

// ReLU: zero out non-positive inputs, pass positives through unchanged.
// NaN inputs fail the `<=` comparison and therefore propagate.
template <typename scalar_t>
struct ReluFunctor {
  scalar_t operator()(scalar_t x) const {
    if (x <= scalar_t{0}) {
      return scalar_t{0};
    }
    return x;
  }
};

// Launches ReluFunctor over `iter` for all standard dtypes plus Half and
// BFloat16 (integral dtypes included, per AT_DISPATCH_ALL_TYPES_AND2).
void relu_kernel(TensorIteratorBase& iter) {
  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "relu_xpu", [&]() {
    gpu_kernel(iter, ReluFunctor<scalar_t>());
  });
}

} // namespace xpu
} // namespace native
} // namespace at
13 changes: 13 additions & 0 deletions src/aten/sycl/ActivationOpsKernels.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include <ATen/native/TensorIterator.h>

namespace at {
namespace native {
namespace xpu {

// Elementwise ReLU over the tensors configured in `iter`.
void relu_kernel(TensorIteratorBase& iter);

} // namespace xpu
} // namespace native
} // namespace at
41 changes: 41 additions & 0 deletions src/aten/sycl/ActivationThresholdKernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/Activation.h>
#include <ATen/native/TensorIterator.h>
#include <aten/sycl/Loops.h>

namespace at {
namespace native {
namespace xpu {

// Yields `value_` wherever the compared input x is at or below
// `threshold_`; otherwise passes `other` through. `other` is `self` in the
// forward pass and grad_output in the backward pass.
template <typename scalar_t>
struct ThresholdFunctor {
  ThresholdFunctor(scalar_t threshold, scalar_t value)
      : threshold_(threshold), value_(value) {}

  scalar_t operator()(scalar_t x, scalar_t other) const {
    if (x <= threshold_) {
      return value_;
    }
    return other;
  }

 private:
  scalar_t threshold_;
  scalar_t value_;
};

// Converts the Scalar threshold/value to the iterated dtype and launches
// ThresholdFunctor over all standard dtypes plus Half/BFloat16. Serves
// both the forward op (value = user-supplied) and threshold_backward
// (value = 0); see Activation.cpp.
void threshold_kernel(
    TensorIteratorBase& iter,
    const Scalar& threshold,
    const Scalar& value) {
  AT_DISPATCH_ALL_TYPES_AND2(
      kHalf, kBFloat16, iter.dtype(), "threshold_xpu", [&]() {
        scalar_t threshold_ = threshold.to<scalar_t>();
        scalar_t value_ = value.to<scalar_t>();
        gpu_kernel_with_scalars(
            iter, ThresholdFunctor<scalar_t>(threshold_, value_));
      });
}

} // namespace xpu
} // namespace native
} // namespace at
Loading

0 comments on commit 0e1beb7

Please sign in to comment.