linear_int4_kernel for XPU #1130

Status: Open. This pull request wants to merge 25 commits into base: main.

Changes from all commits (25 commits)
1e32bbc  Sync main into release/2.6 branch (#1117)  (xytintel, Nov 22, 2024)
f312190  [Release-2.6] Fix bugs of `empty_xpu` and `soft_shrink` (#1139)  (xytintel, Dec 3, 2024)
7ecb0b1  [Release-2.6] Capture rrelu_with_noise noise mutation in compile (#1145)  (xytintel, Dec 5, 2024)
5410f51  contiguous layout for sycl int4 kernel  (airMeng, Nov 22, 2024)
e9311a3  push without compile  (sunjiweiswift, Nov 26, 2024)
e3eaffa  update linearkernel  (sunjiweiswift, Nov 28, 2024)
2a664af  fix some comiple error(not all)  (sunjiweiswift, Nov 28, 2024)
0156ba5  add sycl_ker_config_convention  (sunjiweiswift, Nov 28, 2024)
a58afec  reg kernel for pytorch  (sunjiweiswift, Nov 29, 2024)
f487b20  add yaml for int4mm  (sunjiweiswift, Nov 29, 2024)
ce1c894  update yaml file  (sunjiweiswift, Dec 3, 2024)
d61b198  Modified some review comments  (sunjiweiswift, Dec 3, 2024)
d76a0ce  modify fun name  (sunjiweiswift, Dec 9, 2024)
870a3b5  autogen: _weight_int4pack_mm_with_scales_and_zeros.out  (sunjiweiswift, Dec 10, 2024)
a9627f6  param int->int64_t(python int is int64)  (sunjiweiswift, Dec 10, 2024)
952ead9  use AT_DISPATCH_FLOATING_TYPES_AND  (sunjiweiswift, Dec 10, 2024)
93804f9  Keep the same name as pytorch's _weight_int4pack_mm  (sunjiweiswift, Dec 11, 2024)
9e50b68  modify UT for int4  (sunjiweiswift, Dec 11, 2024)
81a72f1  sync UT with pytoch UT(linalg)  (sunjiweiswift, Dec 12, 2024)
a70df0a  col-major  (sunjiweiswift, Dec 12, 2024)
c08382c  UT pass for B ones  (sunjiweiswift, Dec 13, 2024)
14bb4e0  update gemv  (sunjiweiswift, Dec 16, 2024)
70a3e13  fix scale and zp address  (sunjiweiswift, Dec 17, 2024)
a590ad6  fix K large than 1024 UT  (sunjiweiswift, Dec 18, 2024)
d6a2f3a  bug fix for FP16(BF16 maybe incorrect)  (sunjiweiswift, Dec 18, 2024)
2 changes: 2 additions & 0 deletions .github/scripts/apply_torch_pr.py
@@ -13,6 +13,8 @@
         "https://github.com/pytorch/pytorch/pull/126516",
         # Modify the tolerance level in TIMM benchmark
         "https://github.com/pytorch/pytorch/pull/129735",
+        # [XPU] Update XPU C Shim Header
+        "https://github.com/pytorch/pytorch/pull/141086",
     ]
 )
 parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[])
3 changes: 2 additions & 1 deletion .github/scripts/env.sh
@@ -1,3 +1,4 @@
 #!/bin/bash
-source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
+source /opt/intel/oneapi/compiler/latest/env/vars.sh
+source /opt/intel/oneapi/umf/latest/env/vars.sh
 source /opt/intel/oneapi/pti/latest/env/vars.sh
2 changes: 1 addition & 1 deletion .github/workflows/_linux_ut.yml
@@ -97,7 +97,7 @@ jobs:
       run: |
         source activate xpu_op_${ZE_AFFINITY_MASK}
         source .github/scripts/env.sh
-        pip install mkl-static mkl-include
+        pip install mkl-static==2025.0.1 mkl-include==2025.0.1
         cd ../pytorch
         if [[ ${{ inputs.abi }} == '0' ]]; then
           export _GLIBCXX_USE_CXX11_ABI=0
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand.yml
@@ -123,7 +123,7 @@ jobs:
         conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
         conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
         source activate e2e_ci
-        pip install mkl-static mkl-include
+        pip install mkl-static==2025.0.1 mkl-include==2025.0.1
         pip install pandas scipy tqdm
     - name: Prepare Stock Pytorch
       run: |
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_rolling.yml
@@ -125,7 +125,7 @@ jobs:
         conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
         conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
         source activate e2e_ci
-        pip install mkl-static mkl-include
+        pip install mkl-static==2025.0.1 mkl-include==2025.0.1
         pip install pandas scipy tqdm
     - name: Prepare Stock Pytorch
       run: |
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_whl.yml
@@ -98,7 +98,7 @@ jobs:
         conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
         conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
         source activate e2e_ci
-        pip install mkl-static mkl-include
+        pip install mkl-static==2025.0.1 mkl-include==2025.0.1
         pip install pandas scipy tqdm
     - name: Prepare Stock Pytorch
       run: |
2 changes: 1 addition & 1 deletion cmake/BuildFlags.cmake
@@ -122,7 +122,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")
 
   if(WIN32)
-    set(AOT_TARGETS "ats-m150,lnl-m,mtl-u,mtl-h")
+    set(AOT_TARGETS "ats-m150,mtl-u,mtl-h,xe2-lpg,xe2-hpg")
   else()
     set(AOT_TARGETS "pvc,xe-lpg,ats-m150")
   endif()
63 changes: 63 additions & 0 deletions src/ATen/native/xpu/LinearInt4.cpp
@@ -0,0 +1,63 @@

#include <ATen/core/op_registration/adaption.h>
#include <ATen/div_rtn.h>
#include <ATen/native/TensorIterator.h>
#include <torch/library.h>

#include <ATen/native/xpu/sycl/LinearInt4.h>
#include <comm/xpu_aten.h>

namespace at::native {
Tensor _weight_int4pack_mm_xpu(
const Tensor& A,
const Tensor& B,
int64_t qGroupSize,
const Tensor& qScaleAndZeros) {
auto M = A.size(0);
auto N = B.size(0) * 8;
auto K = A.size(1);
TORCH_CHECK(
A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
__func__,
" : expect A to be either 32-bit or 16-bit float tensor.");
TORCH_CHECK(A.is_contiguous(), __func__, " : expect A to be contiguous.");
TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");

TORCH_CHECK(
B.dtype() == kInt || B.dtype() == kUInt32,
__func__,
" : expect B to be int32 or uint32 tensor.");
TORCH_CHECK(B.is_contiguous(), __func__, " : expect B to be contiguous.");
TORCH_CHECK(B.dim() == 4, __func__, " : expect B to be a 4d tensor.");

TORCH_CHECK(
qGroupSize == 32 || qGroupSize == 64 || qGroupSize == 128 ||
qGroupSize == 256,
__func__,
": expect qGroupSize to be 32, 64, 128 or 256, got ",
qGroupSize);

// TORCH_CHECK(
// qScaleAndZeros.dim() == 3 && qScaleAndZeros.size(1) == N &&
// qScaleAndZeros.size(2) == 2,
// __func__,
// ": expect qScaleAndZeros to be 3d tensor with sizes [:, ",
// N,
// ", 2]");

std::optional<Device> common_device = std::nullopt;
c10::impl::check_and_update_common_device(
common_device, A, "xpu::_weight_int4pack_mm", "A");
c10::impl::check_and_update_common_device(
common_device, B, "xpu::_weight_int4pack_mm", "B");
c10::impl::check_and_update_common_device(
common_device,
qScaleAndZeros,
"xpu::_weight_int4pack_mm",
"qScaleAndZeros");
Tensor C = at::empty({M, N}, A.options());

at::native::xpu::linear_int4_kernel(A, B, qGroupSize, qScaleAndZeros, C);
return C;
}
} // namespace at::native
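
For orientation, here is a hedged caller-side sketch of the contract these checks enforce. It assumes the op is reachable as `at::_weight_int4pack_mm` (the PR keeps PyTorch's name, per the commit log) and that N = B.size(0) * 8; the inner B dimensions and the qScaleAndZeros layout are illustrative assumptions, not taken from this PR:

```cpp
#include <ATen/ATen.h>

// Hedged sketch: exercises the dtype/shape checks in _weight_int4pack_mm_xpu.
// B's inner dimensions and the qScaleAndZeros layout are assumptions here;
// the authoritative packing comes from _convert_weight_to_int4pack.
at::Tensor int4_mm_smoke() {
  const int64_t M = 4, K = 1024, N = 64, qGroupSize = 32;
  auto xpu = at::TensorOptions().device(at::kXPU);

  at::Tensor A = at::randn({M, K}, xpu.dtype(at::kHalf));  // 2-D, contiguous, fp16/bf16/fp32
  at::Tensor B = at::zeros({N / 8, K / 32, 32, 2},         // 4-D int32, so N = B.size(0) * 8
                           xpu.dtype(at::kInt));
  at::Tensor qScaleAndZeros =                              // per-group (scale, zero_point) pairs
      at::ones({K / qGroupSize, N, 2}, xpu.dtype(at::kHalf));

  // Should dispatch to the XPU kernel this PR registers.
  return at::_weight_int4pack_mm(A, B, qGroupSize, qScaleAndZeros);
}
```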
6 changes: 3 additions & 3 deletions src/ATen/native/xpu/RreluWithNoise.cpp
@@ -6,7 +6,7 @@ namespace native {
 
 Tensor& rrelu_with_noise_out_xpu(
     const Tensor& self,
-    const Tensor& noise,
+    Tensor& noise,
     const Scalar& lower,
     const Scalar& upper,
     bool training,
@@ -18,7 +18,7 @@
 
 Tensor rrelu_with_noise_xpu(
     const Tensor& self,
-    const Tensor& noise,
+    Tensor& noise,
     const Scalar& lower,
     const Scalar& upper,
     bool training,
@@ -30,7 +30,7 @@
 
 Tensor& rrelu_with_noise_xpu_(
     Tensor& self,
-    const Tensor& noise,
+    Tensor& noise,
     const Scalar& lower,
     const Scalar& upper,
     bool training,
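
Dropping the const qualifier matters because rrelu writes the sampled slopes into `noise` during training, and torch.compile must see that mutation in the schema (see the "Capture rrelu_with_noise noise mutation in compile" commit). A hedged sketch of the behavior, not the real kernel:

```cpp
#include <ATen/ATen.h>

// Hedged sketch of why `noise` cannot be `const Tensor&`: in training mode
// rrelu samples a per-element slope for negative inputs and records the
// multiplier in `noise` so backward (and graph capture) can replay it.
void rrelu_with_noise_sketch(
    at::Tensor& output,
    const at::Tensor& self,
    at::Tensor& noise, // mutated below; a const ref would hide this write
    double lower,
    double upper) {
  at::Tensor slopes = at::empty_like(self).uniform_(lower, upper);
  at::Tensor mult = at::where(self >= 0, at::ones_like(self), slopes);
  noise.copy_(mult);        // the mutation the schema change above captures
  output.copy_(self * mult);
}
```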
9 changes: 5 additions & 4 deletions src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp
@@ -1,16 +1,17 @@
 #include <ATen/Dispatch.h>
+#include <ATen/NumericUtils.h>
 #include <ATen/native/TensorIterator.h>
 
-#include <ATen/native/xpu/sycl/Loops.h>
-
 #include <ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h>
+#include <ATen/native/xpu/sycl/Loops.h>
 
 namespace at::native::xpu {
 
 template <typename scalar_t>
 struct SoftshrinkFunctor {
   scalar_t operator()(scalar_t a) const {
-    return a > lambd_ ? a - lambd_ : (a < -lambd_ ? a + lambd_ : scalar_t(0));
+    return at::_isnan(a)
+        ? a
+        : (a > lambd_ ? a - lambd_ : (a < -lambd_ ? a + lambd_ : scalar_t(0)));
   }
 
   SoftshrinkFunctor(scalar_t lambd) : lambd_(lambd) {}
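
The explicit NaN branch is needed because both comparisons in the old ternary are false for NaN, so the functor silently mapped NaN inputs to 0 instead of propagating them. A standalone illustration of the two behaviors (plain C++, with `std::isnan` standing in for `at::_isnan`):

```cpp
#include <cmath>
#include <cstdio>

// Old Softshrink body: with a NaN input, `a > lambd` and `a < -lambd`
// are both false, so the expression falls through to 0.
float softshrink_old(float a, float lambd) {
  return a > lambd ? a - lambd : (a < -lambd ? a + lambd : 0.0f);
}

// Fixed body: check for NaN first and propagate it unchanged.
float softshrink_new(float a, float lambd) {
  return std::isnan(a) ? a : softshrink_old(a, lambd);
}

int main() {
  float nan = std::nanf("");
  std::printf("old: %f  new: %f\n",
              softshrink_old(nan, 0.5f),   // prints 0.000000
              softshrink_new(nan, 0.5f));  // prints nan
}
```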