intel · PenghuiCheng · Dec 12, 2024 · Dec 13, 2024 · Dec 16, 2024 · Dec 17, 2024
diff --git a/src/ATen/native/xpu/sycl/Distributions.cpp b/src/ATen/native/xpu/sycl/Distributions.cpp
@@ -126,7 +126,7 @@ void launch_binomial_kernel(TensorIteratorBase& iter, XPUGeneratorImpl* gen) {
 
 template <typename scalar_t, typename accscalar_t>
 struct GammaTensorApplyFunctor {
-  void operator()(
+  [[clang::optnone]] void operator()(
       sycl::nd_item<1> item,
       scalar_t& ret_val,
       const scalar_t& alpha) const {

diff --git a/test/xpu/extended/skip_list_arc.py b/test/xpu/extended/skip_list_arc.py
@@ -7,5 +7,21 @@
         "test_compare_cpu_bincount_xpu_int64",
         "test_compare_cpu_bincount_xpu_int8",
         "test_compare_cpu_bincount_xpu_uint8",
+        # RuntimeError: Kernel is incompatible with all devices in devs
+        # https://github.com/intel/torch-xpu-ops/issues/1150
+        "test_compare_cpu_logcumsumexp_xpu_float16",
+        "test_compare_cpu_logcumsumexp_xpu_float32",
+        "test_compare_cpu_nn_functional_pdist_xpu_float32",
+        "test_compare_cpu_tril_indices_xpu_int32",
+        "test_compare_cpu_tril_indices_xpu_int64",
+        "test_compare_cpu_triu_indices_xpu_int32",
+        "test_compare_cpu_triu_indices_xpu_int64",
+        "test_backward_logcumsumexp_xpu_float32",
+        "test_backward_nn_functional_pdist_xpu_float32",
+        "test_forward_ad_logcumsumexp_xpu_float32",
+        "test_operator_logcumsumexp_xpu_float32",
+        "test_operator_nn_functional_pdist_xpu_float32",
+        "test_view_replay_logcumsumexp_xpu_float32",
+        "test_view_replay_nn_functional_pdist_xpu_float32",
     ),
 }
diff --git a/test/xpu/extended/skip_list_common.py b/test/xpu/extended/skip_list_common.py
@@ -194,5 +194,9 @@
     # Greatest absolute difference: 0.0625 at index (1,) (up to 0.001 allowed)
     #  Greatest relative difference: 0.00640869140625 at index (1,) (up to 0.001 allowed)
     "test_compare_cpu_xlogy_xpu_bfloat16",
+    "test_compare_cpu_div_trunc_rounding_xpu_float64",
+    "test_compare_cpu_div_trunc_rounding_xpu_float16",
+    "test_compare_cpu_div_floor_rounding_xpu_float16",
+    "test_compare_cpu_div_floor_rounding_xpu_bfloat16",
     ),
 }
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
@@ -1548,6 +1548,8 @@
         # XPU does not support tunable.
         "test_bmm_tunableop_rocm_xpu_float32",
         "test_numeric_check_leak_tunableop_rocm_xpu_float32",
+        "test_dump_results_on_exit_tunableop_xpu_float32",
+        "test_rotating_buffer_tunableop_xpu_float32",
         # CUDA bias cases added in latest PyTorch
         # AttributeError: module 'torch._C' has no attribute '_cuda_tunableop_enable'
         "test_matmul_check_entries_tunableop_xpu_float16",

diff --git a/test/xpu/test_unary_ufuncs_xpu.py b/test/xpu/test_unary_ufuncs_xpu.py
@@ -1,6 +1,7 @@
 # Owner(s): ["module: intel"]
 
-from torch.testing._internal.common_device_type import instantiate_device_type_tests
+import torch
+from torch.testing._internal.common_device_type import instantiate_device_type_tests, onlyXPU
 from torch.testing._internal.common_utils import run_tests
 
 try:
@@ -11,6 +12,38 @@
 with XPUPatchForImport(False):
     from test_unary_ufuncs import TestUnaryUfuncs
 
+    @onlyXPU
+    def _nonzero_static_large(self, device):
+        # Large input: sized so a GPU with 132 SMs (H100) gets multiple
+        # iterations per SM. NOTE(review): CUDA-based sizing; confirm for XPU.
+        size_inp = 1024 * 16 * 132 + 1024 * 16
+        x = torch.zeros(size_inp, device=device)
+        # randperm gives unique indices; NOTE(review): `sorted` shadows the builtin
+        indices = torch.randperm(size_inp, device=device)[: size_inp // 2]
+        sorted, _ = torch.sort(indices)
+        x[sorted] = 1
+        res = torch.nonzero_static(x, size=size_inp // 2).view(-1)
+        self.assertEqual(res, sorted)
+        # no out-of-bounds writes: entries of `out` past size_inp // 4 must stay 10
+        out = torch.full((size_inp,), 10, device=device, dtype=torch.int64)
+        res = torch.nonzero_static(x, size=size_inp // 4, out=out[: size_inp // 2])
+        self.assertEqual(out[: size_inp // 4], sorted[: size_inp // 4])
+        self.assertEqual(
+            out[size_inp // 4 :],
+            torch.tensor(10, device="xpu").expand_as(out[size_inp // 4 :]),
+        )
+        # 2-D input: rows past the nonzero count are expected to be filled with -1
+        x = x.view(2, size_inp // 2)
+        ref = x.nonzero()
+        res = x.nonzero_static(size=size_inp // 2 + 2)
+        self.assertEqual(res.shape, [size_inp // 2 + 2, 2])
+        self.assertEqual(ref, res[: size_inp // 2])
+        self.assertEqual(
+            res[size_inp // 2 :],
+            torch.tensor(-1, device="xpu").expand_as(res[size_inp // 2 :]),
+        )
+    TestUnaryUfuncs.test_nonzero_static_large = _nonzero_static_large
+
 instantiate_device_type_tests(TestUnaryUfuncs, globals(),only_for=("xpu"), allow_xpu=True)
 
 if __name__ == "__main__":