From 76597195454d4df4fd7aa2e3fdd550ed388aebf1 Mon Sep 17 00:00:00 2001
From: Cheng
Date: Thu, 12 Dec 2024 01:43:15 +0000
Subject: [PATCH 1/4] Extend UT test_nonzero_static_large to the XPU device

Signed-off-by: Cheng
---
 test/xpu/test_unary_ufuncs_xpu.py | 35 ++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/test/xpu/test_unary_ufuncs_xpu.py b/test/xpu/test_unary_ufuncs_xpu.py
index 0e05a8e7c..a6c12a2ad 100644
--- a/test/xpu/test_unary_ufuncs_xpu.py
+++ b/test/xpu/test_unary_ufuncs_xpu.py
@@ -1,6 +1,7 @@
 # Owner(s): ["module: intel"]
 
-from torch.testing._internal.common_device_type import instantiate_device_type_tests
+import torch
+from torch.testing._internal.common_device_type import instantiate_device_type_tests, onlyXPU
 from torch.testing._internal.common_utils import run_tests
 
 try:
@@ -11,6 +12,38 @@ with XPUPatchForImport(False):
     from test_unary_ufuncs import TestUnaryUfuncs
 
+    @onlyXPU
+    def _nonzero_static_large(self, device):
+        # large enough to have multiple iters per SM even on H100
+        # with 132 sms
+        size_inp = 1024 * 16 * 132 + 1024 * 16
+        x = torch.zeros(size_inp, device=device)
+        # unique indices
+        indices = torch.randperm(size_inp, device=device)[: size_inp // 2]
+        sorted, _ = torch.sort(indices)
+        x[sorted] = 1
+        res = torch.nonzero_static(x, size=size_inp // 2).view(-1)
+        self.assertEqual(res, sorted)
+        # no oob writes
+        out = torch.full((size_inp,), 10, device=device, dtype=torch.int64)
+        res = torch.nonzero_static(x, size=size_inp // 4, out=out[: size_inp // 2])
+        self.assertEqual(out[: size_inp // 4], sorted[: size_inp // 4])
+        self.assertEqual(
+            out[size_inp // 4 :],
+            torch.tensor(10, device="xpu").expand_as(out[size_inp // 4 :]),
+        )
+        # correct fill for 2d
+        x = x.view(2, size_inp // 2)
+        ref = x.nonzero()
+        res = x.nonzero_static(size=size_inp // 2 + 2)
+        self.assertEqual(res.shape, [size_inp // 2 + 2, 2])
+        self.assertEqual(ref, res[: size_inp // 2])
+        self.assertEqual(
+            res[size_inp // 2 :],
+            torch.tensor(-1, device="xpu").expand_as(res[size_inp // 2 :]),
+        )
+    TestUnaryUfuncs.test_nonzero_static_large = _nonzero_static_large
+
 instantiate_device_type_tests(TestUnaryUfuncs, globals(),only_for=("xpu"), allow_xpu=True)
 
 if __name__ == "__main__":

From dfe985bc8759c3d6e5046717c398c7af4dbd2413 Mon Sep 17 00:00:00 2001
From: Cheng
Date: Mon, 16 Dec 2024 08:42:08 +0000
Subject: [PATCH 2/4] Skip UT case test_dump_results_on_exit_tunableop since XPU does not support tunable matmul

Signed-off-by: Cheng
---
 test/xpu/skip_list_common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index e1903f871..d452c43b0 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -1548,6 +1548,7 @@
         # XPU does not support tunable.
         "test_bmm_tunableop_rocm_xpu_float32",
         "test_numeric_check_leak_tunableop_rocm_xpu_float32",
+        "test_dump_results_on_exit_tunableop_xpu_float32",
         # CUDA bias cases added in latest PyTorch
         # AttributeError: module 'torch._C' has no attribute '_cuda_tunableop_enable'
         "test_matmul_check_entries_tunableop_xpu_float16",

From 9616f6deb0398839cc315210e20e6b17aa0b3270 Mon Sep 17 00:00:00 2001
From: Cheng
Date: Tue, 17 Dec 2024 04:04:54 +0000
Subject: [PATCH 3/4] Skip additional UT cases that fail on XPU

Signed-off-by: Cheng
---
 test/xpu/extended/skip_list_arc.py    | 16 ++++++++++++++++
 test/xpu/extended/skip_list_common.py |  4 ++++
 test/xpu/skip_list_common.py          |  1 +
 3 files changed, 21 insertions(+)

diff --git a/test/xpu/extended/skip_list_arc.py b/test/xpu/extended/skip_list_arc.py
index e1e701b84..c8e26ccf3 100644
--- a/test/xpu/extended/skip_list_arc.py
+++ b/test/xpu/extended/skip_list_arc.py
@@ -7,5 +7,21 @@
         "test_compare_cpu_bincount_xpu_int64",
         "test_compare_cpu_bincount_xpu_int8",
         "test_compare_cpu_bincount_xpu_uint8",
+        # RuntimeError: Kernel is incompatible with all devices in devs
+        # https://github.com/intel/torch-xpu-ops/issues/1150
+        "test_compare_cpu_logcumsumexp_xpu_float16",
+        "test_compare_cpu_logcumsumexp_xpu_float32",
+        "test_compare_cpu_nn_functional_pdist_xpu_float32",
+        "test_compare_cpu_tril_indices_xpu_int32",
+        "test_compare_cpu_tril_indices_xpu_int64",
+        "test_compare_cpu_triu_indices_xpu_int32",
+        "test_compare_cpu_triu_indices_xpu_int64",
+        "test_backward_logcumsumexp_xpu_float32",
+        "test_backward_nn_functional_pdist_xpu_float32",
+        "test_forward_ad_logcumsumexp_xpu_float32",
+        "test_operator_logcumsumexp_xpu_float32",
+        "test_operator_nn_functional_pdist_xpu_float32",
+        "test_view_replay_logcumsumexp_xpu_float32",
+        "test_view_replay_nn_functional_pdist_xpu_float32",
     ),
 }
diff --git a/test/xpu/extended/skip_list_common.py b/test/xpu/extended/skip_list_common.py
index 6b5fd653e..643d631eb 100644
--- a/test/xpu/extended/skip_list_common.py
+++ b/test/xpu/extended/skip_list_common.py
@@ -194,5 +194,9 @@
         # Greatest absolute difference: 0.0625 at index (1,) (up to 0.001 allowed)
         # Greatest relative difference: 0.00640869140625 at index (1,) (up to 0.001 allowed)
         "test_compare_cpu_xlogy_xpu_bfloat16",
+        "test_compare_cpu_div_trunc_rounding_xpu_float64",
+        "test_compare_cpu_div_trunc_rounding_xpu_float16",
+        "test_compare_cpu_div_floor_rounding_xpu_float16",
+        "test_compare_cpu_div_floor_rounding_xpu_bfloat16",
     ),
 }
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index d452c43b0..cf14f6c22 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -1549,6 +1549,7 @@
         "test_bmm_tunableop_rocm_xpu_float32",
         "test_numeric_check_leak_tunableop_rocm_xpu_float32",
         "test_dump_results_on_exit_tunableop_xpu_float32",
+        "test_rotating_buffer_tunableop_xpu_float32",
         # CUDA bias cases added in latest PyTorch
         # AttributeError: module 'torch._C' has no attribute '_cuda_tunableop_enable'
         "test_matmul_check_entries_tunableop_xpu_float16",

From 81f56fb892dd4dbeda80585c76343d83d8c5e501 Mon Sep 17 00:00:00 2001
From: Cheng
Date: Tue, 17 Dec 2024 10:02:22 +0000
Subject: [PATCH 4/4] Disable compiler optimization for the gamma kernel functor

Signed-off-by: Cheng
---
 src/ATen/native/xpu/sycl/Distributions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ATen/native/xpu/sycl/Distributions.cpp b/src/ATen/native/xpu/sycl/Distributions.cpp
index 5e32e3b95..dda529820 100644
--- a/src/ATen/native/xpu/sycl/Distributions.cpp
+++ b/src/ATen/native/xpu/sycl/Distributions.cpp
@@ -126,7 +126,7 @@ void launch_binomial_kernel(TensorIteratorBase& iter, XPUGeneratorImpl* gen) {
 
 template <typename scalar_t, typename accscalar_t>
 struct GammaTensorApplyFunctor {
-  void operator()(
+  [[clang::optnone]] void operator()(
       sycl::nd_item<1> item,
       scalar_t& ret_val,
       const scalar_t& alpha) const {
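
Note (reviewer comment, not part of the patches above): [[clang::optnone]] is a Clang-specific function attribute, so PATCH 4/4 turns off optimization only for GammaTensorApplyFunctor::operator() while the rest of Distributions.cpp keeps its normal optimization level. A minimal standalone C++ sketch of the same technique follows; the functor and values in it are illustrative only and are not taken from torch-xpu-ops.

// sketch.cpp: illustrative only, names are made up and not from Distributions.cpp
#include <cstdio>

struct ApplyFunctor {
  // Clang builds just this member function without optimization; callers and
  // the rest of the translation unit are still optimized normally.
  [[clang::optnone]] int operator()(int x) const {
    return x * 2 + 1;
  }
};

int main() {
  ApplyFunctor f;
  std::printf("%d\n", f(20)); // prints 41
  return 0;
}

Compilers that do not implement clang::optnone (for example GCC) only emit an unknown-attribute warning and ignore it, so the annotation stays local to Clang-based builds such as the SYCL compiler used for XPU.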