Commit 0e1beb7: Complement part of activation ops (#49)

e.g. relu, threshold, threshold_backward, gelu, gelu_backward, tanh_backward

Signed-off-by: Feng Yuan <[email protected]>
Co-authored-by: Feng Yuan <[email protected]>

Parent: e755b1e
Showing 14 changed files with 495 additions and 0 deletions.
New file (hunk `@@ -0,0 +1,10 @@`; the patterns suggest a `.gitignore`):

```
*/*.pyc
*/*.so*
*/**/__pycache__
*/**/*.dylib*
*/**/*.pyc
*/**/*.pyd
*/**/*.so*
*/**/**/*.pyc
*/**/**/**/*.pyc
*/**/**/**/**/*.pyc
```
New file (hunk `@@ -0,0 +1,124 @@`; the filename is not preserved in this capture, but the code implements the `XPUNativeFunctions` activation entry points). Shown in three parts: relu, threshold, and gelu.
```cpp
#include <ATen/ScalarOps.h>
#include <ATen/XPUNativeFunctions.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/TensorIterator.h>

#include <aten/sycl/ActivationGeluKernel.h>
#include <aten/sycl/ActivationOpsKernels.h>
#include <aten/sycl/ActivationThresholdKernel.h>

namespace at {

Tensor XPUNativeFunctions::relu(const Tensor& self) {
  Tensor out;
  auto iter = TensorIterator::unary_op(out, self);
  native::xpu::relu_kernel(iter);
  return iter.output();
}

Tensor& XPUNativeFunctions::relu_(Tensor& self) {
  auto iter = TensorIterator::unary_op(self, self);
  native::xpu::relu_kernel(iter);
  return self;
}

Tensor& XPUNativeFunctions::relu_out(const Tensor& self, Tensor& out) {
  auto iter = TensorIterator::unary_op(out, self);
  native::xpu::relu_kernel(iter);
  return out;
}
```
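All three relu variants funnel into the same `relu_kernel`; only the `TensorIterator` setup differs (a fresh output, the input aliased as its own output, or a caller-provided `out`). A minimal caller-side sketch, assuming an XPU-enabled ATen build (an assumption, not part of this diff; `at::kCPU` shows the same call pattern on any build):

```cpp
#include <ATen/ATen.h>

int main() {
  // Requires a PyTorch build with this XPU backend registered.
  at::Tensor x = at::randn({4}, at::dtype(at::kFloat).device(at::kXPU));

  at::Tensor y = x.relu();  // out-of-place: TensorIterator allocates `out`
  x.relu_();                // in-place: `self` is both input and output
  // The `out=` variant follows the same pattern with a preallocated tensor.
  return 0;
}
```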
```cpp
Tensor XPUNativeFunctions::threshold(
    const Tensor& self,
    const Scalar& threshold,
    const Scalar& value) {
  Tensor out;
  auto iter = TensorIterator::binary_op(out, self, self);
  native::xpu::threshold_kernel(iter, threshold, value);
  return iter.output();
}

Tensor& XPUNativeFunctions::threshold_(
    Tensor& self,
    const Scalar& threshold,
    const Scalar& value) {
  auto iter = TensorIterator::binary_op(self, self, self);
  native::xpu::threshold_kernel(iter, threshold, value);
  return self;
}

Tensor& XPUNativeFunctions::threshold_out(
    const Tensor& self,
    const Scalar& threshold,
    const Scalar& value,
    Tensor& out) {
  auto iter = TensorIterator::binary_op(out, self, self);
  native::xpu::threshold_kernel(iter, threshold, value);
  return out;
}

Tensor XPUNativeFunctions::threshold_backward(
    const Tensor& grad_output,
    const Tensor& self,
    const Scalar& threshold) {
  Tensor grad_input;
  auto iter = TensorIterator::binary_op(grad_input, self, grad_output);
  native::xpu::threshold_kernel(iter, threshold, 0);
  return iter.output();
}

Tensor& XPUNativeFunctions::threshold_backward_out(
    const Tensor& grad_output,
    const Tensor& self,
    const Scalar& threshold,
    Tensor& grad_input) {
  auto iter = TensorIterator::binary_op(grad_input, self, grad_output);
  native::xpu::threshold_kernel(iter, threshold, 0);
  return grad_input;
}
```
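Note how forward and backward share one kernel. The functor (defined in the threshold kernel file below) computes `x <= threshold ? value : other`. The forward pass feeds `self` as both operands; the backward pass feeds `grad_output` as the second operand with `value = 0`, which is exactly the chain rule for a threshold. With threshold $t$ and replacement value $v$:

```latex
\mathrm{threshold}(x) = \begin{cases} v & x \le t \\ x & x > t \end{cases}
\qquad\Longrightarrow\qquad
\frac{\partial L}{\partial x} = \begin{cases} 0 & x \le t \\ \dfrac{\partial L}{\partial y} & x > t \end{cases}
```

which is precisely `threshold_kernel(iter, threshold, 0)` applied to `(self, grad_output)`.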
```cpp
Tensor XPUNativeFunctions::gelu(
    const Tensor& self,
    c10::string_view approximate) {
  Tensor out;
  auto iter = TensorIterator::unary_op(out, self);
  native::xpu::gelu_kernel(iter, approximate);
  return iter.output();
}

Tensor& XPUNativeFunctions::gelu_(Tensor& self, c10::string_view approximate) {
  auto iter = TensorIterator::unary_op(self, self);
  native::xpu::gelu_kernel(iter, approximate);
  return self;
}

Tensor& XPUNativeFunctions::gelu_out(
    const Tensor& self,
    c10::string_view approximate,
    Tensor& out) {
  auto iter = TensorIterator::unary_op(out, self);
  native::xpu::gelu_kernel(iter, approximate);
  return out;
}

Tensor XPUNativeFunctions::gelu_backward(
    const Tensor& grad_output,
    const Tensor& self,
    c10::string_view approximate) {
  Tensor grad_input;
  auto iter = TensorIterator::binary_op(grad_input, grad_output, self);
  native::xpu::gelu_backward_kernel(iter, approximate);
  return iter.output();
}

Tensor& XPUNativeFunctions::gelu_backward_out(
    const Tensor& grad_output,
    const Tensor& self,
    c10::string_view approximate,
    Tensor& grad_input) {
  auto iter = TensorIterator::binary_op(grad_input, grad_output, self);
  native::xpu::gelu_backward_kernel(iter, approximate);
  return grad_input;
}

} // namespace at
```
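The `approximate` string selects between the exact erf form and the tanh approximation (both kernels follow in the next file). A caller-side sketch, under the same XPU-build assumption as above:

```cpp
#include <ATen/ATen.h>

int main() {
  at::Tensor x = at::randn({8}, at::dtype(at::kFloat).device(at::kXPU));
  at::Tensor exact  = at::gelu(x);          // approximate = "none": erf path
  at::Tensor approx = at::gelu(x, "tanh");  // tanh approximation path
  return 0;
}
```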
New file (hunk `@@ -0,0 +1,123 @@`; the include in the first file suggests `aten/sycl/ActivationGeluKernel.cpp`). Shown in three parts: the tanh-approximation functors, the erf functors, and the dispatch glue.
```cpp
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/Activation.h>
#include <ATen/native/TensorIterator.h>
#include <aten/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

namespace at {
namespace native {
namespace xpu {

template <typename scalar_t>
struct GeluTanhFunctor {
  scalar_t operator()(scalar_t x) const {
    using opmath_t = at::opmath_type<scalar_t>;
    constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5);
    constexpr opmath_t kKappa = 0.044715;
    auto x_cube = static_cast<opmath_t>(x) * static_cast<opmath_t>(x) *
        static_cast<opmath_t>(x);
    auto inner = kBeta * (static_cast<opmath_t>(x) + kKappa * x_cube);
    return opmath_t(0.5) * static_cast<opmath_t>(x) *
        (opmath_t(1) + c10::xpu::compat::tanh(inner));
  }
};

template <typename scalar_t>
struct GeluTanhBackwardFunctor {
  scalar_t operator()(scalar_t dy, scalar_t x) const {
    using opmath_t = at::opmath_type<scalar_t>;
    constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5);
    constexpr opmath_t kKappa = 0.044715;
    auto x_sq = static_cast<opmath_t>(x) * static_cast<opmath_t>(x);
    auto x_cube = x_sq * static_cast<opmath_t>(x);
    auto inner = kBeta * (static_cast<opmath_t>(x) + kKappa * x_cube);
    auto tanh_inner = c10::xpu::compat::tanh(inner);

    auto left = opmath_t(0.5) * static_cast<opmath_t>(x);
    auto right = opmath_t(1) + tanh_inner;

    auto left_derivative = opmath_t(0.5) * right;

    auto tanh_derivative = opmath_t(1) - tanh_inner * tanh_inner;
    auto inner_derivative = kBeta * (opmath_t(1) + opmath_t(3) * kKappa * x_sq);
    auto right_derivative = left * tanh_derivative * inner_derivative;

    return static_cast<opmath_t>(dy) * (left_derivative + right_derivative);
  }
};
```
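In math terms, with $\beta = \sqrt{2/\pi}$ (note `M_SQRT2 * M_2_SQRTPI * 0.5` $= \sqrt{2}\cdot\tfrac{2}{\sqrt{\pi}}\cdot\tfrac{1}{2} = \sqrt{2/\pi}$) and $\kappa = 0.044715$, this functor pair implements

```latex
\mathrm{GELU}_{\tanh}(x) = \tfrac{1}{2}\,x\,\bigl(1 + \tanh u\bigr),
\qquad u = \beta\,(x + \kappa x^{3}),
```

and, by the product and chain rules,

```latex
\frac{d}{dx}\,\mathrm{GELU}_{\tanh}(x)
  = \underbrace{\tfrac{1}{2}\bigl(1 + \tanh u\bigr)}_{\texttt{left\_derivative}}
  + \underbrace{\tfrac{1}{2}\,x\,\bigl(1 - \tanh^{2} u\bigr)\,\beta\,\bigl(1 + 3\kappa x^{2}\bigr)}_{\texttt{right\_derivative}}.
```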
```cpp
template <typename scalar_t>
struct GeluErfFunctor {
  scalar_t operator()(scalar_t x) const {
    using opmath_t = at::opmath_type<scalar_t>;
    constexpr opmath_t kAlpha = M_SQRT1_2;
    return static_cast<opmath_t>(x) * opmath_t(0.5) *
        (opmath_t(1) + ::erf(static_cast<opmath_t>(x) * kAlpha));
  }
};

template <typename scalar_t>
struct GeluErfBackwardFunctor {
  scalar_t operator()(scalar_t dy, scalar_t x) const {
    using opmath_t = at::opmath_type<scalar_t>;
    constexpr opmath_t kBeta = M_2_SQRTPI * M_SQRT1_2 * opmath_t(0.5);
    constexpr opmath_t kAlpha = M_SQRT1_2;
    const opmath_t cdf = opmath_t(0.5) *
        (opmath_t(1) + ::erf(static_cast<opmath_t>(x) * kAlpha));
    const opmath_t pdf = c10::xpu::compat::exp(
                             opmath_t(-0.5) * static_cast<opmath_t>(x) *
                             static_cast<opmath_t>(x)) *
        kBeta;
    return static_cast<opmath_t>(dy) * (cdf + static_cast<opmath_t>(x) * pdf);
  }
};
```
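The erf pair is the exact definition $\mathrm{GELU}(x) = x\,\Phi(x)$, where $\Phi$ is the standard normal CDF; here `kBeta` $= \tfrac{2}{\sqrt{\pi}}\cdot\tfrac{1}{\sqrt{2}}\cdot\tfrac{1}{2} = \tfrac{1}{\sqrt{2\pi}}$ is the normal PDF's normalizing constant:

```latex
\Phi(x) = \tfrac{1}{2}\Bigl(1 + \operatorname{erf}\bigl(\tfrac{x}{\sqrt{2}}\bigr)\Bigr),
\qquad
\frac{d}{dx}\bigl(x\,\Phi(x)\bigr) = \Phi(x) + x\,\varphi(x),
\qquad
\varphi(x) = \tfrac{1}{\sqrt{2\pi}}\,e^{-x^{2}/2}.
```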
```cpp
void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate) {
  auto approximate_ = at::native::get_gelutype_enum(approximate);
  if (approximate_ == at::native::GeluType::Tanh) {
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::BFloat16,
        at::ScalarType::Half,
        iter.dtype(),
        "gelu_tanh_xpu",
        [&]() { gpu_kernel(iter, GeluTanhFunctor<scalar_t>()); });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::BFloat16,
        at::ScalarType::Half,
        iter.dtype(),
        "gelu_erf_xpu",
        [&]() { gpu_kernel(iter, GeluErfFunctor<scalar_t>()); });
  }
}

void gelu_backward_kernel(
    TensorIteratorBase& iter,
    c10::string_view approximate) {
  auto approximate_ = at::native::get_gelutype_enum(approximate);
  if (approximate_ == at::native::GeluType::Tanh) {
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::BFloat16,
        at::ScalarType::Half,
        iter.dtype(),
        "gelu_tanh_backward_xpu",
        [&]() {
          gpu_kernel_with_scalars(iter, GeluTanhBackwardFunctor<scalar_t>());
        });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::BFloat16,
        at::ScalarType::Half,
        iter.dtype(),
        "gelu_erf_backward_xpu",
        [&]() {
          gpu_kernel_with_scalars(iter, GeluErfBackwardFunctor<scalar_t>());
        });
  }
}

} // namespace xpu
} // namespace native
} // namespace at
```
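Since both paths compute the same function up to approximation error, a quick standalone check can compare them (plain C++, no ATen; the constants mirror the functors above, and the `M_*` macros are POSIX math constants that may need `_USE_MATH_DEFINES` or manual definitions on some toolchains):

```cpp
#include <cmath>
#include <cstdio>

double gelu_erf(double x) {
  return x * 0.5 * (1.0 + std::erf(x * M_SQRT1_2));
}

double gelu_tanh(double x) {
  const double kBeta = M_SQRT2 * M_2_SQRTPI * 0.5;  // sqrt(2/pi)
  const double kKappa = 0.044715;
  double inner = kBeta * (x + kKappa * x * x * x);
  return 0.5 * x * (1.0 + std::tanh(inner));
}

int main() {
  // The two forms agree to within ~1e-3 over typical activation ranges.
  for (double x = -4.0; x <= 4.0; x += 1.0) {
    std::printf("x=%+.1f  erf=%+.6f  tanh=%+.6f  diff=%+.1e\n",
                x, gelu_erf(x), gelu_tanh(x), gelu_erf(x) - gelu_tanh(x));
  }
  return 0;
}
```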
New file (hunk `@@ -0,0 +1,17 @@`; the matching kernel header, `aten/sycl/ActivationGeluKernel.h` per the include in the first file):
```cpp
#pragma once

#include <ATen/native/TensorIterator.h>

namespace at {
namespace native {
namespace xpu {

void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate);

void gelu_backward_kernel(
    TensorIteratorBase& iter,
    c10::string_view approximate);

} // namespace xpu
} // namespace native
} // namespace at
```
New file (hunk `@@ -0,0 +1,28 @@`; likely `aten/sycl/ActivationOpsKernels.cpp` per the include in the first file):
```cpp
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/Activation.h>
#include <ATen/native/TensorIterator.h>
#include <aten/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

namespace at {
namespace native {
namespace xpu {

template <typename scalar_t>
struct ReluFunctor {
  scalar_t operator()(scalar_t x) const {
    return x <= scalar_t{0} ? scalar_t{0} : x;
  }
};

void relu_kernel(TensorIteratorBase& iter) {
  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "relu_xpu", [&]() {
    gpu_kernel(iter, ReluFunctor<scalar_t>());
  });
}

} // namespace xpu
} // namespace native
} // namespace at
```
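One subtlety of `x <= 0 ? 0 : x`: every comparison with NaN is false, so NaN inputs fall through to the `: x` branch and propagate rather than being clamped to zero. A standalone sanity check of the same expression (plain C++, no ATen):

```cpp
#include <cassert>
#include <cmath>
#include <limits>

// Mirrors ReluFunctor's expression for a host-side check.
template <typename T>
T relu_like(T x) {
  return x <= T{0} ? T{0} : x;
}

int main() {
  assert(relu_like(-2.0f) == 0.0f);
  assert(relu_like(0.0f) == 0.0f);
  assert(relu_like(3.5f) == 3.5f);
  // NaN <= 0 is false, so NaN propagates through the functor.
  assert(std::isnan(relu_like(std::numeric_limits<float>::quiet_NaN())));
  return 0;
}
```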
New file (hunk `@@ -0,0 +1,13 @@`; likely `aten/sycl/ActivationOpsKernels.h`):
```cpp
#pragma once

#include <ATen/native/TensorIterator.h>

namespace at {
namespace native {
namespace xpu {

void relu_kernel(TensorIteratorBase& iter);

} // namespace xpu
} // namespace native
} // namespace at
```
New file (hunk `@@ -0,0 +1,41 @@`; likely `aten/sycl/ActivationThresholdKernel.cpp`):
```cpp
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/Activation.h>
#include <ATen/native/TensorIterator.h>
#include <aten/sycl/Loops.h>

namespace at {
namespace native {
namespace xpu {

template <typename scalar_t>
struct ThresholdFunctor {
  scalar_t operator()(scalar_t x, scalar_t other) const {
    return x <= threshold_ ? value_ : other;
  }

  ThresholdFunctor(scalar_t threshold, scalar_t value)
      : threshold_(threshold), value_(value) {}

 private:
  scalar_t threshold_;
  scalar_t value_;
};

void threshold_kernel(
    TensorIteratorBase& iter,
    const Scalar& threshold,
    const Scalar& value) {
  AT_DISPATCH_ALL_TYPES_AND2(
      kHalf, kBFloat16, iter.dtype(), "threshold_xpu", [&]() {
        scalar_t threshold_ = threshold.to<scalar_t>();
        scalar_t value_ = value.to<scalar_t>();
        gpu_kernel_with_scalars(
            iter, ThresholdFunctor<scalar_t>(threshold_, value_));
      });
}

} // namespace xpu
} // namespace native
} // namespace at
```
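A standalone illustration of how this one functor serves both directions (plain C++, no ATen; the scalar pairs stand in for what the `TensorIterator` feeds the kernel element-wise):

```cpp
#include <cassert>

// Mirrors ThresholdFunctor for a host-side check.
template <typename T>
struct ThresholdLike {
  T t, v;
  T operator()(T x, T other) const { return x <= t ? v : other; }
};

int main() {
  ThresholdLike<float> fwd{/*t=*/1.0f, /*v=*/-1.0f};
  // Forward: both operands are `self`, so the result is x <= t ? v : x.
  assert(fwd(0.5f, 0.5f) == -1.0f);
  assert(fwd(2.0f, 2.0f) == 2.0f);

  ThresholdLike<float> bwd{/*t=*/1.0f, /*v=*/0.0f};
  // Backward: operands are (self, grad_output) with v = 0,
  // so the incoming gradient is masked wherever self <= t.
  assert(bwd(0.5f, 10.0f) == 0.0f);
  assert(bwd(2.0f, 10.0f) == 10.0f);
  return 0;
}
```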