linear_int4_kernel for XPU #1130

Status: Open. This pull request wants to merge 25 commits into base: main.

Changes from all commits (25 commits)
1e32bbc  Sync main into release/2.6 branch (#1117)  (xytintel, Nov 22, 2024)
f312190  [Release-2.6] Fix bugs of `empty_xpu` and `soft_shrink` (#1139)  (xytintel, Dec 3, 2024)
7ecb0b1  [Release-2.6] Capture rrelu_with_noise noise mutation in compile (#1145)  (xytintel, Dec 5, 2024)
5410f51  contiguous layout for sycl int4 kernel  (airMeng, Nov 22, 2024)
e9311a3  push without compile  (sunjiweiswift, Nov 26, 2024)
e3eaffa  update linearkernel  (sunjiweiswift, Nov 28, 2024)
2a664af  fix some comiple error(not all)  (sunjiweiswift, Nov 28, 2024)
0156ba5  add sycl_ker_config_convention  (sunjiweiswift, Nov 28, 2024)
a58afec  reg kernel for pytorch  (sunjiweiswift, Nov 29, 2024)
f487b20  add yaml for int4mm  (sunjiweiswift, Nov 29, 2024)
ce1c894  update yaml file  (sunjiweiswift, Dec 3, 2024)
d61b198  Modified some review comments  (sunjiweiswift, Dec 3, 2024)
d76a0ce  modify fun name  (sunjiweiswift, Dec 9, 2024)
870a3b5  autogen: _weight_int4pack_mm_with_scales_and_zeros.out  (sunjiweiswift, Dec 10, 2024)
a9627f6  param int->int64_t(python int is int64)  (sunjiweiswift, Dec 10, 2024)
952ead9  use AT_DISPATCH_FLOATING_TYPES_AND  (sunjiweiswift, Dec 10, 2024)
93804f9  Keep the same name as pytorch's _weight_int4pack_mm  (sunjiweiswift, Dec 11, 2024)
9e50b68  modify UT for int4  (sunjiweiswift, Dec 11, 2024)
81a72f1  sync UT with pytoch UT(linalg)  (sunjiweiswift, Dec 12, 2024)
a70df0a  col-major  (sunjiweiswift, Dec 12, 2024)
c08382c  UT pass for B ones  (sunjiweiswift, Dec 13, 2024)
14bb4e0  update gemv  (sunjiweiswift, Dec 16, 2024)
70a3e13  fix scale and zp address  (sunjiweiswift, Dec 17, 2024)
a590ad6  fix K large than 1024 UT  (sunjiweiswift, Dec 18, 2024)
d6a2f3a  bug fix for FP16(BF16 maybe incorrect)  (sunjiweiswift, Dec 18, 2024)
2 changes: 2 additions & 0 deletions .github/scripts/apply_torch_pr.py
@@ -13,6 +13,8 @@
         "https://github.com/pytorch/pytorch/pull/126516",
         # Modify the tolerance level in TIMM benchmark
         "https://github.com/pytorch/pytorch/pull/129735",
+        # [XPU] Update XPU C Shim Header
+        "https://github.com/pytorch/pytorch/pull/141086",
     ]
 )
 parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[])
3 changes: 2 additions & 1 deletion .github/scripts/env.sh
@@ -1,3 +1,4 @@
 #!/bin/bash
-source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
+source /opt/intel/oneapi/compiler/latest/env/vars.sh
+source /opt/intel/oneapi/umf/latest/env/vars.sh
 source /opt/intel/oneapi/pti/latest/env/vars.sh
2 changes: 1 addition & 1 deletion .github/workflows/_linux_ut.yml
@@ -97,7 +97,7 @@ jobs:
       run: |
         source activate xpu_op_${ZE_AFFINITY_MASK}
         source .github/scripts/env.sh
-        pip install mkl-static mkl-include
+        pip install mkl-static==2025.0.1 mkl-include==2025.0.1
         cd ../pytorch
         if [[ ${{ inputs.abi }} == '0' ]]; then
           export _GLIBCXX_USE_CXX11_ABI=0
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand.yml
@@ -123,7 +123,7 @@ jobs:
         conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
         conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
         source activate e2e_ci
-        pip install mkl-static mkl-include
+        pip install mkl-static==2025.0.1 mkl-include==2025.0.1
         pip install pandas scipy tqdm
     - name: Prepare Stock Pytorch
       run: |
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_rolling.yml
@@ -125,7 +125,7 @@ jobs:
         conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
         conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
         source activate e2e_ci
-        pip install mkl-static mkl-include
+        pip install mkl-static==2025.0.1 mkl-include==2025.0.1
         pip install pandas scipy tqdm
     - name: Prepare Stock Pytorch
       run: |
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_whl.yml
@@ -98,7 +98,7 @@ jobs:
         conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
         conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
         source activate e2e_ci
-        pip install mkl-static mkl-include
+        pip install mkl-static==2025.0.1 mkl-include==2025.0.1
         pip install pandas scipy tqdm
     - name: Prepare Stock Pytorch
       run: |
2 changes: 1 addition & 1 deletion cmake/BuildFlags.cmake
@@ -122,7 +122,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")
 
   if(WIN32)
-    set(AOT_TARGETS "ats-m150,lnl-m,mtl-u,mtl-h")
+    set(AOT_TARGETS "ats-m150,mtl-u,mtl-h,xe2-lpg,xe2-hpg")
   else()
     set(AOT_TARGETS "pvc,xe-lpg,ats-m150")
   endif()
63 changes: 63 additions & 0 deletions src/ATen/native/xpu/LinearInt4.cpp
@@ -0,0 +1,63 @@

#include <ATen/core/op_registration/adaption.h>
#include <ATen/div_rtn.h>
#include <ATen/native/TensorIterator.h>
#include <torch/library.h>

#include <ATen/native/xpu/sycl/LinearInt4.h>
#include <comm/xpu_aten.h>

namespace at::native {
Tensor _weight_int4pack_mm_xpu(
const Tensor& A,
const Tensor& B,
int64_t qGroupSize,
const Tensor& qScaleAndZeros) {
auto M = A.size(0);
auto N = B.size(0) * 8;
auto K = A.size(1);
TORCH_CHECK(
A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
__func__,
" : expect A to be either 32-bit or 16-bit float tensor.");
TORCH_CHECK(A.is_contiguous(), __func__, " : expect A to be contiguous.");
TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");

TORCH_CHECK(
B.dtype() == kInt || B.dtype() == kUInt32,
__func__,
" : expect B to be int32 or uint32 tensor.");
TORCH_CHECK(B.is_contiguous(), __func__, " : expect B to be contiguous.");
TORCH_CHECK(B.dim() == 4, __func__, " : expect B to be a 4d tensor.");

TORCH_CHECK(
qGroupSize == 32 || qGroupSize == 64 || qGroupSize == 128 ||
qGroupSize == 256,
__func__,
": expect qGroupSize to be 32, 64, 128 or 256, got ",
qGroupSize);

// TORCH_CHECK(
// qScaleAndZeros.dim() == 3 && qScaleAndZeros.size(1) == N &&
// qScaleAndZeros.size(2) == 2,
// __func__,
// ": expect qScaleAndZeros to be 3d tensor with sizes [:, ",
// N,
// ", 2]");

std::optional<Device> common_device = std::nullopt;
c10::impl::check_and_update_common_device(
common_device, A, "xpu::_weight_int4pack_mm", "A");
c10::impl::check_and_update_common_device(
common_device, B, "xpu::_weight_int4pack_mm", "B");
c10::impl::check_and_update_common_device(
common_device,
qScaleAndZeros,
"xpu::_weight_int4pack_mm",
"qScaleAndZeros");
Tensor C = at::empty({M, N}, A.options());

at::native::xpu::linear_int4_kernel(A, B, qGroupSize, qScaleAndZeros, C);
return C;
}
} // namespace at::native
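
For orientation, here is a hedged caller-side sketch of the contract these checks enforce. It assumes the op is reachable as `at::_weight_int4pack_mm` (the PR keeps PyTorch's name, per the commit log) and that N = B.size(0) * 8; the inner B dimensions and the qScaleAndZeros layout are illustrative assumptions, not taken from this PR:

```cpp
#include <ATen/ATen.h>

// Hedged sketch: exercises the dtype/shape checks in _weight_int4pack_mm_xpu.
// B's inner dimensions and the qScaleAndZeros layout are assumptions here;
// the authoritative packing comes from _convert_weight_to_int4pack.
at::Tensor int4_mm_smoke() {
  const int64_t M = 4, K = 1024, N = 64, qGroupSize = 32;
  auto xpu = at::TensorOptions().device(at::kXPU);

  at::Tensor A = at::randn({M, K}, xpu.dtype(at::kHalf));  // 2-D, contiguous, fp16/bf16/fp32
  at::Tensor B = at::zeros({N / 8, K / 32, 32, 2},         // 4-D int32, so N = B.size(0) * 8
                           xpu.dtype(at::kInt));
  at::Tensor qScaleAndZeros =                              // per-group (scale, zero_point) pairs
      at::ones({K / qGroupSize, N, 2}, xpu.dtype(at::kHalf));

  // Should dispatch to the XPU kernel this PR registers.
  return at::_weight_int4pack_mm(A, B, qGroupSize, qScaleAndZeros);
}
```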
6 changes: 3 additions & 3 deletions src/ATen/native/xpu/RreluWithNoise.cpp
@@ -6,7 +6,7 @@ namespace native {
 
 Tensor& rrelu_with_noise_out_xpu(
     const Tensor& self,
-    const Tensor& noise,
+    Tensor& noise,
     const Scalar& lower,
     const Scalar& upper,
     bool training,
@@ -18,7 +18,7 @@
 
 Tensor rrelu_with_noise_xpu(
     const Tensor& self,
-    const Tensor& noise,
+    Tensor& noise,
     const Scalar& lower,
     const Scalar& upper,
     bool training,
@@ -30,7 +30,7 @@
 
 Tensor& rrelu_with_noise_xpu_(
     Tensor& self,
-    const Tensor& noise,
+    Tensor& noise,
     const Scalar& lower,
     const Scalar& upper,
     bool training,
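
Dropping the const qualifier matters because rrelu writes the sampled slopes into `noise` during training, and torch.compile must see that mutation in the schema (see the "Capture rrelu_with_noise noise mutation in compile" commit). A hedged sketch of the behavior, not the real kernel:

```cpp
#include <ATen/ATen.h>

// Hedged sketch of why `noise` cannot be `const Tensor&`: in training mode
// rrelu samples a per-element slope for negative inputs and records the
// multiplier in `noise` so backward (and graph capture) can replay it.
void rrelu_with_noise_sketch(
    at::Tensor& output,
    const at::Tensor& self,
    at::Tensor& noise, // mutated below; a const ref would hide this write
    double lower,
    double upper) {
  at::Tensor slopes = at::empty_like(self).uniform_(lower, upper);
  at::Tensor mult = at::where(self >= 0, at::ones_like(self), slopes);
  noise.copy_(mult);        // the mutation the schema change above captures
  output.copy_(self * mult);
}
```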
9 changes: 5 additions & 4 deletions src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp
@@ -1,16 +1,17 @@
 #include <ATen/Dispatch.h>
+#include <ATen/NumericUtils.h>
 #include <ATen/native/TensorIterator.h>
 
-#include <ATen/native/xpu/sycl/Loops.h>
-
 #include <ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h>
+#include <ATen/native/xpu/sycl/Loops.h>
 
 namespace at::native::xpu {
 
 template <typename scalar_t>
 struct SoftshrinkFunctor {
   scalar_t operator()(scalar_t a) const {
-    return a > lambd_ ? a - lambd_ : (a < -lambd_ ? a + lambd_ : scalar_t(0));
+    return at::_isnan(a)
+        ? a
+        : (a > lambd_ ? a - lambd_ : (a < -lambd_ ? a + lambd_ : scalar_t(0)));
   }
 
   SoftshrinkFunctor(scalar_t lambd) : lambd_(lambd) {}
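
The explicit NaN branch is needed because both comparisons in the old ternary are false for NaN, so the functor silently mapped NaN inputs to 0 instead of propagating them. A standalone illustration of the two behaviors (plain C++, with `std::isnan` standing in for `at::_isnan`):

```cpp
#include <cmath>
#include <cstdio>

// Old Softshrink body: with a NaN input, `a > lambd` and `a < -lambd`
// are both false, so the expression falls through to 0.
float softshrink_old(float a, float lambd) {
  return a > lambd ? a - lambd : (a < -lambd ? a + lambd : 0.0f);
}

// Fixed body: check for NaN first and propagate it unchanged.
float softshrink_new(float a, float lambd) {
  return std::isnan(a) ? a : softshrink_old(a, lambd);
}

int main() {
  float nan = std::nanf("");
  std::printf("old: %f  new: %f\n",
              softshrink_old(nan, 0.5f),   // prints 0.000000
              softshrink_new(nan, 0.5f));  // prints nan
}
```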