From f08bd5182df894019cfd07f04f7b2125750af713 Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Fri, 1 Nov 2024 09:28:52 -0700 Subject: [PATCH] Add cuda arch guard to skip ampere matmul tests on Hopper GPUs (#3324) This PR adds `NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD` to the Ampere matmul tests because the Hopper `MultiMatmulScheduler` will not support them. --- tests/cpp/test_matmul.cpp | 51 +++++++++++++------- tests/cpp/test_matmul_sass.cpp | 2 + tests/cpp/test_matmul_scheduler.cpp | 72 ++++++++++++++--------------- 3 files changed, 73 insertions(+), 52 deletions(-) diff --git a/tests/cpp/test_matmul.cpp b/tests/cpp/test_matmul.cpp index dcde07275d7..1c0fca0ac89 100644 --- a/tests/cpp/test_matmul.cpp +++ b/tests/cpp/test_matmul.cpp @@ -84,6 +84,8 @@ using namespace at::indexing; // Matmul test for Ampere MMA: across supported layouts TEST_P(MatmulTestWithLayout, AmpereMatmul) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -204,6 +206,8 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBroadcastBatch) { } TEST_P(MatmulTestWithLayout, AmperePrologueFusionBroadcast) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -258,6 +262,8 @@ TEST_P(MatmulTestWithLayout, AmperePrologueFusionBroadcast) { } TEST_P(MatmulTestWithLayout, AmpereProloguePointwise) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -319,6 +325,8 @@ TEST_P(MatmulTestWithLayout, AmpereProloguePointwise) { } TEST_P(MatmulTestWithLayout, AmpereMatmulBFloat16) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -377,6 +385,8 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBFloat16) { // Matmul test for Ampere MMA: with pipelined gmem load TEST_P(MatmulTestWithLayout, AmpereMatmulPipelineGmem) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; REQUIRE_DEVICE_SMEM_SIZE(70 << 10, 0); @@ -439,6 +449,8 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulPipelineGmem) { // Matmul test for Ampere MMA: checking CTA Swizzles TEST_P(MatmulTestWithLayout, AmpereSwizzle) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int dim = 8192; int M = dim, N = dim, K = dim; @@ -585,6 +597,8 @@ TEST_P(MatmulTestWithLayout, AmpereSwizzle) { } TEST_P(MatmulTestWithLayout, AmpereMatmulRegCircularBuffer) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; REQUIRE_DEVICE_SMEM_SIZE(70 << 10, 0); @@ -1317,6 +1331,8 @@ TEST_F(MatmulTest, MatmulSoftmaxMatmulAmpere) { // Matmul test for Turing MMA: across supported layouts TEST_P(MatmulTestWithLayout, TuringMatmul) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -2036,6 +2052,8 @@ TEST_F(MatmulTest, AmpereMatmulTNSwizzled) { // Matmul test on Ampere using ldmatrix.x4 to load operands TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoad) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + REQUIRE_DEVICE_SMEM_SIZE(98384, 0); // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -2093,6 +2111,8 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoad) { // Matmul test for Turing MMA: across supported layouts TEST_P(MatmulTestWithLayout, TuringMatmulLargeLoad) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -2147,6 +2167,8 @@ TEST_P(MatmulTestWithLayout, TuringMatmulLargeLoad) { // Tile layout check for symmetric 4-warp recipes TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck4warp) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + REQUIRE_DEVICE_SMEM_SIZE(98384, 0); // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -2225,6 +2247,8 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck4warp) { } TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck8warp) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + REQUIRE_DEVICE_SMEM_SIZE(98384, 0); // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -2298,6 +2322,8 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck8warp) { } TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck6warp) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + REQUIRE_DEVICE_SMEM_SIZE(98384, 0); // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; @@ -2366,6 +2392,8 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck6warp) { // Matmul test on Ampere using ldmatrix.x4 to load operands TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoadLargeK) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 2048; Fusion fusion; @@ -2423,6 +2451,8 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoadLargeK) { // Matmul test for Ampere MMA: across supported layouts TEST_P(MatmulTestWithLayout, AmpereSplitKLikeStridedBatchedMatmul) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); + // Keep multiples of 8 to keep vectorizable. int B = 2, M = 504, N = 136, K = 248; @@ -2916,10 +2946,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueRelu) { // Test the matmul scheduler's single-kernel split-K support TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitK_CUDA) { - // requires Ampere or higher GPU - if (!deviceMajorMinorCheck(8)) { - GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; - } + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0); // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 8096; @@ -2994,10 +3021,7 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitK_CUDA) { // Test splitk with bias epilogue TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitKBias_CUDA) { - // requires Ampere or higher GPU - if (!deviceMajorMinorCheck(8)) { - GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; - } + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0); // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 8096; @@ -3063,10 +3087,7 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitKBias_CUDA) { // Same as above but has a batch dimension and splitk TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitK) { - // requires Ampere or higher GPU - if (!deviceMajorMinorCheck(8)) { - GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; - } + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0); // Keep multiples of 8 to keep vectorizable. int B = 2, M = 504, N = 136, K = 2048; @@ -3128,10 +3149,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitK) { // Test batch splitk with bias epilogue TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitKBias) { - // requires Ampere or higher GPU - if (!deviceMajorMinorCheck(8)) { - GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; - } + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0); // Keep multiples of 8 to keep vectorizable. int B = 2, M = 504, N = 136, K = 2048; @@ -3259,6 +3277,7 @@ TEST_F(MatmulTest, ReproIssue1808) { // Test matmul with sizes that are not divisible by 8 and with misaligned inputs TEST_P(MatmulTestWithLayout, MisalignedVectorization) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0); for (bool add_2d_bias : {false, true}) { for (bool downcast_output : {false, true}) { for (const auto& [M, N, K, alignA, alignB, alignBias] : std::vector< diff --git a/tests/cpp/test_matmul_sass.cpp b/tests/cpp/test_matmul_sass.cpp index 60012b2136e..974300401d2 100644 --- a/tests/cpp/test_matmul_sass.cpp +++ b/tests/cpp/test_matmul_sass.cpp @@ -184,6 +184,8 @@ sass::Container getBinaryOpMulEpilogueSASSFor( } // namespace TEST_P(MatmulSASSTestWithLayout, AmpereSanity) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0); + // Keep multiples of 8 to keep vectorizable. int M = 504, N = 136, K = 248; diff --git a/tests/cpp/test_matmul_scheduler.cpp b/tests/cpp/test_matmul_scheduler.cpp index 9c249243dc3..e532f32ef57 100644 --- a/tests/cpp/test_matmul_scheduler.cpp +++ b/tests/cpp/test_matmul_scheduler.cpp @@ -127,7 +127,7 @@ void checkUnsegmentedVectorization( // D = (A x B) + bias // Target architectures: Turing, Ampere TEST_P(PrecisionParametrizedTest, EpilogueBias) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; static TestCaseErrorThresholds errs = { @@ -227,7 +227,7 @@ TEST_P(PrecisionParametrizedTest, EpilogueBias) { // D = relu(A x B) // Target architectures: Turing, Ampere TEST_P(PrecisionParametrizedTest, EpilogueRelu) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; static TestCaseErrorThresholds errs = { @@ -312,7 +312,7 @@ TEST_P(PrecisionParametrizedTest, EpilogueRelu) { // Target architectures: Ampere TEST_P(PrecisionParametrizedTest, EpilogueBiasRelu) { // NOTE: test skips Turing arch, the relative error was too big - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; static TestCaseErrorThresholds errs = { @@ -416,7 +416,7 @@ TEST_P(PrecisionParametrizedTest, EpilogueBiasRelu) { // Aux = relu(D) // Target architectures: Turing, Ampere TEST_P(PrecisionParametrizedTest, EpilogueReluAux) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; static TestCaseErrorThresholds errs = { @@ -507,7 +507,7 @@ TEST_P(PrecisionParametrizedTest, EpilogueReluAux) { // Target architectures: Ampere TEST_P(PrecisionParametrizedTest, EpilogueBiasReluAux) { // NOTE: test skips Turing arch, the relative error was too big - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; static TestCaseErrorThresholds errs = { @@ -616,7 +616,7 @@ TEST_P(PrecisionParametrizedTest, EpilogueBiasReluAux) { // D = gelu(A x B) // Target architectures: Turing, Ampere TEST_P(PrecisionParametrizedTest, EpilogueGelu) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; static TestCaseErrorThresholds errs = { @@ -700,7 +700,7 @@ TEST_P(PrecisionParametrizedTest, EpilogueGelu) { // Aux = gelu(D) // Target architectures: Turing, Ampere TEST_P(PrecisionParametrizedTest, EpilogueGeluAux) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; static TestCaseErrorThresholds errs = { @@ -789,7 +789,7 @@ TEST_P(PrecisionParametrizedTest, EpilogueGeluAux) { // D = gelu((A x B) + bias) // Target architectures: Turing, Ampere TEST_P(PrecisionParametrizedTest, EpilogueBiasGelu) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; static TestCaseErrorThresholds errs = { @@ -893,7 +893,7 @@ TEST_P(PrecisionParametrizedTest, EpilogueBiasGelu) { // Target architectures: Ampere TEST_P(PrecisionParametrizedTest, EpilogueBiasGeluAux) { // NOTE: test skips Turing arch, the relative error was too big - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; static TestCaseErrorThresholds errs = { @@ -1012,7 +1012,7 @@ INSTANTIATE_TEST_SUITE_P( }); TEST_F(MatmulSchedulerTest, FusedMultiplySumOnly) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1096,7 +1096,7 @@ TEST_F(MatmulSchedulerTest, BasicMatmulStrictCheckTT) { // Matmul test that reslies on segmenter for 'C = A x B' fusion, for Ampere TEST_P(MatmulSchedulerTestWithLayout, BasicMatmulRelaxedCheck) { // skip until we have Hopper support - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const int M = 504, N = 136, K = 2048; auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1143,7 +1143,7 @@ TEST_P(MatmulSchedulerTestWithLayout, BasicMatmulRelaxedCheck) { // MMA second input is passed as first fusion parameter. TEST_F(MatmulSchedulerTest, BasicMatmulInputShuffledTT) { // skip until we have Hopper support - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const int M = 504, N = 136, K = 2048; const auto layout = MmaLayout::TT; auto fusion = std::make_unique(); @@ -1189,7 +1189,7 @@ TEST_F(MatmulSchedulerTest, BasicMatmulInputShuffledTT) { // Matmul test that uses segmenter for 'C = float2half(A x B)' fusion, for // Ampere TEST_F(MatmulSchedulerTest, EpilogueOutputCast) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1240,7 +1240,7 @@ TEST_F(MatmulSchedulerTest, EpilogueOutputCast) { // Matmul test that uses segmenter for 'C = alpha * (A x B)' fusion, for // Ampere TEST_F(MatmulSchedulerTest, EpilogueAlpha) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1294,7 +1294,7 @@ TEST_F(MatmulSchedulerTest, EpilogueAlpha) { // Matmul test that uses segmenter for 'C = float2half(alpha * (A x B))' // fusion, for Ampere TEST_F(MatmulSchedulerTest, EpilogueAlphaOutputCast) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1350,7 +1350,7 @@ TEST_F(MatmulSchedulerTest, EpilogueAlphaOutputCast) { // Matmul test that uses segmenter for fusion for Ampere: // D = (A x B) + beta * C TEST_F(MatmulSchedulerTest, EpilogueBeta) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1419,7 +1419,7 @@ TEST_F(MatmulSchedulerTest, EpilogueBeta) { // Matmul test that uses segmenter for fusion for Ampere: // D = alpha * (A x B) + beta * C TEST_F(MatmulSchedulerTest, EpilogueAlphaBeta) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1493,7 +1493,7 @@ TEST_F(MatmulSchedulerTest, EpilogueAlphaBeta) { // Matmul test that uses segmenter for fusion for Ampere: // D = gelu(alpha * (A x B) + beta * C) TEST_F(MatmulSchedulerTest, EpilogueAlphaBetaGeluOutputCast) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const auto layout = MmaLayout::TT; auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1659,7 +1659,7 @@ TEST_F(MatmulSchedulerTest, EpilogueAlphaBetaBias) { // Strided batch gemm test taht uses matmul scheduler, for Ampere: // D = (A x B) TEST_P(MatmulSchedulerTestWithLayout, StridedBatch) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const int M = 504, N = 136, K = 248, B = 2; auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1711,7 +1711,7 @@ TEST_P(MatmulSchedulerTestWithLayout, StridedBatch) { // for Ampere architecture: // D = alpha * (A x B) + beta * C TEST_P(MatmulSchedulerTestWithLayout, StridedBatchEpilogueAlphaBeta) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const int M = 504, N = 136, K = 248, B = 2; auto fusion = std::make_unique(); @@ -1785,7 +1785,7 @@ TEST_P(MatmulSchedulerTestWithLayout, StridedBatchEpilogueAlphaBeta) { // there is only single C tensor for whole batch; test for Ampere architecture: // D = alpha * (A x B) + beta * C TEST_P(MatmulSchedulerTestWithLayout, StridedBatchEpilogueAlphaSingleBeta) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const int M = 504, N = 136, K = 248, B = 2; auto fusion = std::make_unique(); @@ -1863,7 +1863,7 @@ TEST_P(MatmulSchedulerTestWithLayout, StridedBatchEpilogueAlphaSingleBeta) { // Strided batch gemm test with bias that uses matmul scheduler, for Ampere: // D = (A x B) + bias TEST_P(MatmulSchedulerTestWithLayout, StridedBatchEpilogueBias) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const int M = 504, N = 136, K = 248, B = 2; auto fusion = std::make_unique(); @@ -1923,7 +1923,7 @@ TEST_P(MatmulSchedulerTestWithLayout, StridedBatchEpilogueBias) { // scheduler, for Ampere: // D = (A x B) + bias TEST_P(MatmulSchedulerTestWithLayout, StridedBatchEpilogueSingleBias) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); const int M = 504, N = 136, K = 248, B = 2; auto fusion = std::make_unique(); @@ -1983,7 +1983,7 @@ TEST_P(MatmulSchedulerTestWithLayout, StridedBatchEpilogueSingleBias) { // Test matmul with contiguous inputs but sizes that are not divisible by 8 and // with misaligned input pointers TEST_P(MatmulSchedulerTestWithLayout, MisalignedVectorization) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); // TODO: parametrized test instead of nested loops (still use a loop over // sizes and re-use FusionExecutorCache) for (bool add_2d_bias : {false, true}) { @@ -2136,7 +2136,7 @@ TEST_P(MatmulSchedulerTestWithLayout, MisalignedVectorization) { // Test matmul with strided inputs. This tests that vectorization is properly // computed. TEST_P(MatmulSchedulerTestWithLayout, StridedInputs) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); for (bool add_2d_bias : {false, true}) { for (bool downcast_output : {false, true}) { auto run = [&](int M, @@ -2499,7 +2499,7 @@ TEST_F(MatmulSchedulerPluginTest, BasicMatmul) { // this test with all three combinations (with and without each scheduler, but // at least one enabled). TEST_F(MatmulSchedulerTest, SegmentMatmulOpPrologue) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -2539,7 +2539,7 @@ TEST_F(MatmulSchedulerTest, SegmentMatmulOpPrologue) { // This is just like the above test but with LinearOp instead of MatmulOp TEST_F(MatmulSchedulerTest, SegmentLinearOpPrologue) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -2580,7 +2580,7 @@ TEST_F(MatmulSchedulerTest, SegmentLinearOpPrologue) { // Test that the matmul scheduler refuses to translate a matmul that is not // Half or BFloat16 TEST_F(MatmulSchedulerTest, SegmentMatmulOpUnsupportedDtype) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -2636,7 +2636,7 @@ class MatmulFusionTest : public MatmulSchedulerTest, // Test that we can segment a Fusion containing two matmuls TEST_P(MatmulFusionTest, Llama2FFN) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -2787,7 +2787,7 @@ class AllocationDomainTest // [M, K] and [K, N], and all possible combinations of allocation domains. // Please note that inpout in B is transposed prior to creating a Mma op. TEST_P(AllocationDomainTest, BasicMatmul) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); bool a_m_inner = std::get<0>(GetParam()); bool b_k_inner = std::get<1>(GetParam()); @@ -2821,7 +2821,7 @@ TEST_P(AllocationDomainTest, BasicMatmul) { // Same as above but without the the input tv1 being transposed. TEST_P(AllocationDomainTest, BasicMatmulNoTranspose) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); bool a_m_inner = std::get<0>(GetParam()); bool b_k_inner = std::get<1>(GetParam()); @@ -2853,7 +2853,7 @@ TEST_P(AllocationDomainTest, BasicMatmulNoTranspose) { } TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSet) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); bool a_m_inner = std::get<0>(GetParam()); bool b_k_inner = std::get<1>(GetParam()); @@ -2889,7 +2889,7 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSet) { } TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSin) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); bool a_m_inner = std::get<0>(GetParam()); bool b_k_inner = std::get<1>(GetParam()); @@ -2928,7 +2928,7 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSin) { // Matmul test for Ampere MMA: across supported layouts TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinNoTranspose) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); bool a_m_inner = std::get<0>(GetParam()); bool b_k_inner = std::get<1>(GetParam()); @@ -2964,7 +2964,7 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinNoTranspose) { } TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinSetNoTranspose) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); bool a_m_inner = std::get<0>(GetParam()); bool b_k_inner = std::get<1>(GetParam()); @@ -3001,7 +3001,7 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinSetNoTranspose) { } TEST_P(AllocationDomainTest, MatmulWithPrologueSetCastSinTranspose) { - NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 10, 0); + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(7, 5, 9, 0); bool a_m_inner = std::get<0>(GetParam()); bool b_k_inner = std::get<1>(GetParam());