From 727ec7fc4459742b7b1edb0f5e865d552d10d821 Mon Sep 17 00:00:00 2001
From: protonu
Date: Wed, 11 Dec 2024 14:22:02 -0800
Subject: [PATCH 1/5] adding a new unit test for mma+bias and propagating
 schedules

---
 csrc/scheduler/hopper_multi_matmul.cpp | 118 ++++++++-----------------
 tests/cpp/test_matmul_scheduler.cpp    |  74 +++++++++++++++-
 2 files changed, 107 insertions(+), 85 deletions(-)

diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp
index e23587abf10..e57f0fdd19b 100644
--- a/csrc/scheduler/hopper_multi_matmul.cpp
+++ b/csrc/scheduler/hopper_multi_matmul.cpp
@@ -435,97 +435,45 @@ void HopperMultipleMatmulScheduler::scheduleMmaResults() {
 }
 
 void HopperMultipleMatmulScheduler::scheduleOutputTensor(TensorView* c) {
-  const MatMulTileOptions& gemm_tile = params_->tile_sizes;
-  const int64_t vectorization_factor = params_->supported_vec_size.epilogue;
-  // input tensor is in the form of [Mo,No,cta_tile_m,cta_tile_n]
-  mma_utils::checkConcreteStaticDim(c->axis(-2));
-  mma_utils::checkConcreteStaticDim(c->axis(-1));
-  const int64_t tile_size_m = c->axis(-2)->extent()->evaluate().as<int64_t>();
-  const int64_t tile_size_n = c->axis(-1)->extent()->evaluate().as<int64_t>();
-  NVF_ERROR(
-      tile_size_m == gemm_tile.cta_tile.m,
-      "Actual tile size at axis(-2) in output tensor is different from CTA tile size! Expected: ",
-      gemm_tile.cta_tile.m,
-      ", actual: ",
-      tile_size_m);
-  NVF_ERROR(
-      tile_size_n == gemm_tile.cta_tile.n,
-      "Actual tile size at axis(-1) in output tensor is different from CTA tile size! Expected: ",
-      gemm_tile.cta_tile.n,
-      ", actual: ",
-      tile_size_n);
-  const int64_t tot_elements = tile_size_m * tile_size_n;
-  constexpr int64_t warp_size = 32l;
-  const int64_t tidx = warp_size;
-  const int64_t tidy = gemm_tile.cta_tile.n / gemm_tile.warp_tile.n;
-  const int64_t tidz = gemm_tile.cta_tile.m / gemm_tile.warp_tile.m;
-  // step-1, merge last 2 dims
-  c->merge(-2);
-  // [Mo, No, m*n]
-
-  // step-2, set vectorization to maximum
-  // We have fixed tidx, tidy, and tidz, so we need to make sure that the
-  // output tensor is divisible by tidx * tidy * tidz * vectorization_factor
-  NVF_ERROR(
-      tot_elements % (tidx * tidy * tidz * vectorization_factor) == 0,
-      "Output tensor cannot be fully vectorized! tot_elements: ",
-      tot_elements,
-      ", tidx: ",
-      tidx,
-      ", tidy: ",
-      tidy,
-      ", tidz: ",
-      tidz,
-      ", vectorization_factor: ",
-      vectorization_factor);
-  c->split(-1, vectorization_factor);
-  c->axis(-1)->parallelize(ParallelType::Vectorize);
-  // [Mo, No, m*n/vect, vect]
-
-  // step-3, Split out a warp for TIDx
-  c->split(-2, tidx);
-  c->axis(-2)->parallelize(ParallelType::TIDx);
-  // [Mo, No, m*n/vect/TIDx, TIDx, vect]
-
-  // step-4, Split out for TIDy and TIDz
-  // TIDy = cta_tile_n/warp_tile_n
-  // TIDz = cta_tile_m/warp_tile_m
-  c->split(-3, tidy);
-  c->axis(-3)->parallelize(ParallelType::TIDy);
-
-  c->split(-4, tidz);
-  c->axis(-4)->parallelize(ParallelType::TIDz);
-  // [Mo, No, m*n/vect/TIDx/TIDy/TIDz, TIDz, TIDy, TIDx, vect]
-
-  for (TensorView* mma_result : mma_results_) {
-    // step-5, Parallel first 2 dims same as mma_result
-    scheduler_utils::parallelizeAllLike(
-        mma_result,
-        2,
-        {c},
-        {ParallelType::BIDx, ParallelType::BIDy, ParallelType::BIDz});
-  }
+  // Block Schedule and Parallelize
+  blockTileTensors({c});
+  parallelizeBlocks({c});
+
+  // Apply mma common transformation
+  c->split(-2, getM(params_->mma_macro));
+  c->split(-1, getN(params_->mma_macro));
+  // [..., Mo, No, Mio, Mii, Nio, Nii]
+  // -> [..., Mo, No, Mio, Nio, Mii, Nii]
+  c->reorder({{-3, -2}});
+  c->merge(-4);
+
+  // [..., Mii, Nii] ->
+  // [..., Mii/16, Miioi(2), Miii(8), Nii/8, Niio(4), Niii(2)] ->
+  // [..., Mii/16, Miii(8), Niio(4), Nii/8, Miioi(2), Niii(2)] ->
+  // [..., Mii/16 * Miii(8) * Niio(4), Nii/8, Miioi(2), Niii(2)]
+  // Mii/16 * Miii(8) * Niio(4) is 128 for Hopper and this is parallelized as
+  // TIDx.
+  auto s =
+      mma_utils::MmaSwizzler::scheduleMmaOutputAllocation(c->getLoopDomain());
+  c->setLoopDomain(s.as<std::vector<IterDomain*>>());
+  c->axis(-5)->parallelize(ParallelType::TIDy);
 }
 
 void HopperMultipleMatmulScheduler::scheduleEpilogue() {
-  // TODO: schedule epilogue by propagation backward from dc
   if (!params_->use_smem_epilogue) {
     for (Val* dv : fusion_->outputs()) {
       auto* d = dv->as<TensorView>();
       NVF_ERROR(d->definition() && d->definition()->isA<LoadStoreOp>());
-      auto* dc = d->definition()->input(0)->as<TensorView>();
-
-      std::vector<TensorView*> tvs_to_schedule{d};
-      if (std::find(mma_results_.begin(), mma_results_.end(), dc) ==
-          mma_results_.end()) {
-        // Skip scheduling dc if it is an mma_result. This can happen if we are
-        // not casting back to half-precision in the output
-        tvs_to_schedule.push_back(dc);
-      }
-      // Block Schedule and Parallelize
-      blockTileTensors(tvs_to_schedule);
-      parallelizeBlocks(tvs_to_schedule);
+      // Schedule the output TV and propagate it back to the outputs of the Mma
+      // op.
+      scheduleOutputTensor(d);
+      scheduler_utils::BoundedDirectionalTransformPropagator::backward(
+          d,
+          -1,
+          mma_results_,
+          scheduler_utils::BoundedDirectionalTransformPropagator::Options()
+              .propagateParallelType());
 
       // Apply mma common transformation
       for (auto tv : tvs_to_schedule) {
@@ -534,8 +482,12 @@ void HopperMultipleMatmulScheduler::scheduleEpilogue() {
             tv->getLoopDomain());
         tv->setLoopDomain(s.as<std::vector<IterDomain*>>());
       }
+      // We don't respect vectorization_factor as yet. We vectorize the
+      // inner-dim with extent 2.
+      // TODO: support vectorization_factor.
       d->axis(-1)->parallelize(ParallelType::Vectorize);
     }
+    scheduleFusionInputsForEpilogue();
   } else {
     constexpr int64_t stmatrix_tile_m = 16;
     constexpr int64_t stmatrix_tile_n = 16;
diff --git a/tests/cpp/test_matmul_scheduler.cpp b/tests/cpp/test_matmul_scheduler.cpp
index a6ddb8d0ca8..848f02bc82f 100644
--- a/tests/cpp/test_matmul_scheduler.cpp
+++ b/tests/cpp/test_matmul_scheduler.cpp
@@ -3296,8 +3296,7 @@ class HopperMatmulSchedulerTest
     KernelExecutor ke;
     ke.compile(fusion, inputs, LaunchParams(), matmul_cparams);
     auto nvf_out = ke.run(inputs);
-    // NOTE Relax tolerances for split-k case
-    EXPECT_TRUE(at::allclose(nvf_out.at(0), tref, 1e-3, 1e-3));
+    EXPECT_TRUE(at::allclose(nvf_out.at(0), tref, 1e-2, 1e-2));
   }
 
  protected:
@@ -3377,6 +3376,77 @@ TEST_P(HopperMatmulSchedulerTest, FusedMultiplySum) {
   tref = atMatmul(A.squeeze(), B.squeeze(), layout);
 }
 
+TEST_P(HopperMatmulSchedulerTest, FusedMultiplySumBiasNeg) {
+  if (use_smem_epilogue) {
+    GTEST_SKIP()
+        << "TODO: We don't support smem epilogue in the Hopper matmul scheduler right now";
+  }
+  const auto& [A, B] =
+      matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype));
+  const auto& C = matmulAtInput2D(
+      layout, TensorMatmulPos::Bias, data_type_to_aten(dtype), M, N, K);
+  inputs = {A, B, C};
+
+  TensorView* tv0 = nullptr;
+  TensorView* tv1 = nullptr;
+  std::unordered_map<int64_t, int64_t> old2new;
+  int64_t k_axis = 0;
+
+  switch (layout) {
+    case MmaLayout::TT:
+      // Inner dims KN, order is MKN
+      tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype);
+      tv1 = makeContigConcreteTensor({1, -1, -1}, dtype);
+      old2new = {{-2, -1}, {-1, -2}};
+      k_axis = -2;
+      break;
+    case MmaLayout::TN:
+      // Inner dims KK, order is MNK
+      tv0 = makeContigConcreteTensor({-1, 1, -1}, dtype);
+      tv1 = makeContigConcreteTensor({1, -1, -1}, dtype);
+      old2new = {};
+      k_axis = -1;
+      break;
+    case MmaLayout::NT:
+      // Inner dims MN, order is KMN
+      tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype);
+      tv1 = makeContigConcreteTensor({-1, 1, -1}, dtype);
+      old2new = {{-3, -1}};
+      k_axis = -3;
+      break;
+    case MmaLayout::NN:
+      // Inner dims MK, order is NKM
+      tv0 = makeContigConcreteTensor({1, -1, -1}, dtype);
+      tv1 = makeContigConcreteTensor({-1, -1, 1}, dtype);
+      old2new = {{-1, -3}};
+      k_axis = -2;
+      break;
+  }
+  TensorView* tv2 = makeContigConcreteTensor({-1}, dtype);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  fusion->addInput(tv2);
+
+  auto tv3 = fusedMultiplySum(tv0, tv1, {k_axis});
+
+  // Reorder the accumulator as [M, N, K]
+  tv3->reorder(old2new);
+  tv3->commitLeafToLogical();
+
+  auto* tv4 = maybeCastOp(DataType::Float, tv2);
+  auto* tv5 = biasEpilogue(tv3, tv4);
+  auto* tv6 = neg(tv5);
+  auto* tv7 = castOp(dtype, tv6);
+  fusion->addOutput(tv7);
+
+  tref = atBiasEpilogue(
+             atMatmul(A.squeeze(), B.squeeze(), layout),
+             C.to(data_type_to_aten(DataType::Float)))
+             .neg_()
+             .to(data_type_to_aten(DataType::Half));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     General,
     HopperMatmulSchedulerTest,

From 03180d1d256e66a9dc0fc72187892a87afbb5dcb Mon Sep 17 00:00:00 2001
From: protonu
Date: Fri, 13 Dec 2024 10:34:46 -0800
Subject: [PATCH 2/5] rebase and address reviewer comments

---
 csrc/scheduler/hopper_multi_matmul.cpp | 105 +++++++------------------
 csrc/scheduler/hopper_multi_matmul.h   |   7 --
 tests/cpp/test_matmul_scheduler.cpp    |   2 +
 3 files changed, 30 insertions(+), 84 deletions(-)

diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp
index e57f0fdd19b..fb324d73251 100644
--- a/csrc/scheduler/hopper_multi_matmul.cpp
+++ b/csrc/scheduler/hopper_multi_matmul.cpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -434,32 +435,21 @@ void HopperMultipleMatmulScheduler::scheduleMmaResults() {
   }
 }
 
-void HopperMultipleMatmulScheduler::scheduleOutputTensor(TensorView* c) {
-  // Block Schedule and Parallelize
-  blockTileTensors({c});
-  parallelizeBlocks({c});
-
-  // Apply mma common transformation
-  c->split(-2, getM(params_->mma_macro));
-  c->split(-1, getN(params_->mma_macro));
-  // [..., Mo, No, Mio, Mii, Nio, Nii]
-  // -> [..., Mo, No, Mio, Nio, Mii, Nii]
-  c->reorder({{-3, -2}});
-  c->merge(-4);
-
-  // [..., Mii, Nii] ->
-  // [..., Mii/16, Miioi(2), Miii(8), Nii/8, Niio(4), Niii(2)] ->
-  // [..., Mii/16, Miii(8), Niio(4), Nii/8, Miioi(2), Niii(2)] ->
-  // [..., Mii/16 * Miii(8) * Niio(4), Nii/8, Miioi(2), Niii(2)]
-  // Mii/16 * Miii(8) * Niio(4) is 128 for Hopper and this is parallelized as
-  // TIDx.
-  auto s =
-      mma_utils::MmaSwizzler::scheduleMmaOutputAllocation(c->getLoopDomain());
-  c->setLoopDomain(s.as<std::vector<IterDomain*>>());
-  c->axis(-5)->parallelize(ParallelType::TIDy);
-}
-
 void HopperMultipleMatmulScheduler::scheduleEpilogue() {
+  // Load/cache the epilogue inputs if there are any.
+  auto& c_tvs = tensor_roles_.at(MatmulTensorRole::EPILOGUE_INPUT);
+  std::vector<TensorView*> cached_tvs;
+  for (auto* c : c_tvs) {
+    cached_tvs.push_back(c->cacheAfter());
+  }
+
+  // Propato to (not including) the splitk output if there is a splitk
+  // else this is just mma_results_
+  std::vector<TensorView*> propagate_to = splitk_sums_;
+  if (tensor_roles_.count(MatmulTensorRole::EPILOGUE_INPUT)) {
+    propagate_to.insert(propagate_to.end(), c_tvs.begin(), c_tvs.end());
+  }
+
   if (!params_->use_smem_epilogue) {
     for (Val* dv : fusion_->outputs()) {
       auto* d = dv->as<TensorView>();
@@ -467,27 +457,30 @@ void HopperMultipleMatmulScheduler::scheduleEpilogue() {
 
       // Schedule the output TV and propagate it back to the outputs of the Mma
       // op.
-      scheduleOutputTensor(d);
+      blockTileTensors({d});
+      parallelizeBlocks({d});
+      transformLikeMmaOutput(d, /*is_mma_result=*/false);
+
+      auto s = mma_utils::MmaSwizzler::scheduleMmaOutputAllocation(
+          d->getLoopDomain());
+      d->setLoopDomain(s.as<std::vector<IterDomain*>>());
+
       scheduler_utils::BoundedDirectionalTransformPropagator::backward(
           d,
           -1,
-          mma_results_,
+          propagate_to,
           scheduler_utils::BoundedDirectionalTransformPropagator::Options()
              .propagateParallelType());
 
-      // Apply mma common transformation
-      for (auto tv : tvs_to_schedule) {
-        transformLikeMmaOutput(tv, /*is_mma_result=*/false);
-        auto s = mma_utils::MmaSwizzler::scheduleMmaOutputAllocation(
-            tv->getLoopDomain());
-        tv->setLoopDomain(s.as<std::vector<IterDomain*>>());
-      }
       // We don't respect vectorization_factor as yet. We vectorize the
       // inner-dim with extent 2.
       // TODO: support vectorization_factor.
       d->axis(-1)->parallelize(ParallelType::Vectorize);
+      scheduler_utils::parallelizeAllLike(d, -1, cached_tvs);
+
+      // The cached EPILOGUE_INPUT tvs are not needed anymore
+      cached_tvs.clear();
     }
-    scheduleFusionInputsForEpilogue();
   } else {
     constexpr int64_t stmatrix_tile_m = 16;
     constexpr int64_t stmatrix_tile_n = 16;
@@ -561,48 +554,6 @@ void HopperMultipleMatmulScheduler::scheduleEpilogue() {
   }
 }
 
-//! Propagates transformations from fusion output to fusion tv inputs that are
-//! producers in the epilogue. Transformations' propagation aims at input tvs
-//! which are not assigned to core roles, that is, are not MMA inputs.
-void HopperMultipleMatmulScheduler::scheduleFusionInputsForEpilogue() {
-  std::vector<TensorView*> cached_tvs;
-
-  // Handling transformations in fusion input tvs with assigned EPILOGUE_INPUT
-  // role by propagating fusion output transformations through cached views
-  // of EPILOGUE_INPUT fusion input tvs and by setting vectorization of the
-  // inner most iterdomain of these cached views
-  if (tensor_roles_.count(MatmulTensorRole::EPILOGUE_INPUT)) {
-    auto& c_tvs = tensor_roles_.at(MatmulTensorRole::EPILOGUE_INPUT);
-
-    // The system supports only scenario where there is only one fusion output
-    // with assigned OUTPUT role, this condition is already verified so there
-    // is no need for an additional checks here
-    auto output_d = tensor_roles_.at(MatmulTensorRole::OUTPUT).front();
-    for (auto* c : c_tvs) {
-      cached_tvs.push_back(c->cacheAfter());
-    }
-
-    scheduler_utils::BoundedDirectionalTransformPropagator::backward(
-        output_d, -1, c_tvs);
-
-    std::unordered_set<ParallelType> parallel_types = {};
-    if (params_->use_smem_epilogue) {
-      // In cases where smem epilogue feature is enabled, the vectorization
-      // of domains will be propagated to fusion inputs that are epilogue
-      // inputs, this may result in unaligned memory reads. Vectorization is
-      // explicitly excluded form parallelization types to avoid this issue.
-      // This should be changed when vectorization analysis is available and
-      // enabled for matmul scheduler.
-      parallel_types = allParallelTypesExcept({ParallelType::Vectorize});
-    }
-    scheduler_utils::parallelizeAllLike(
-        output_d, -1, cached_tvs, parallel_types);
-
-    // The cached EPILOGUE_INPUT tvs are not needed anymore
-    cached_tvs.clear();
-  }
-}
-
 void HopperMultipleMatmulScheduler::scheduleSplitKSum() {
   if (params_->splitk_factor == 1) {
     return;
diff --git a/csrc/scheduler/hopper_multi_matmul.h b/csrc/scheduler/hopper_multi_matmul.h
index 5eab0f4fbed..295b55ee96e 100644
--- a/csrc/scheduler/hopper_multi_matmul.h
+++ b/csrc/scheduler/hopper_multi_matmul.h
@@ -171,15 +171,8 @@ class HopperMultipleMatmulScheduler : public MultipleMatmulScheduler {
 
   void scheduleMmaResults();
 
-  void scheduleOutputTensor(TensorView* c);
-
   void scheduleEpilogue();
 
-  //! Propagates transformations from fusion output to fusion tv inputs that are
-  //! producers in the epilogue. Transformations' propagation aims at input tvs
-  //! which are not assigned to core roles, that is, are not MMA inputs.
-  void scheduleFusionInputsForEpilogue();
-
   void scheduleSplitKSum();
 
   void setUpInlining();
diff --git a/tests/cpp/test_matmul_scheduler.cpp b/tests/cpp/test_matmul_scheduler.cpp
index 848f02bc82f..0ffde4364c1 100644
--- a/tests/cpp/test_matmul_scheduler.cpp
+++ b/tests/cpp/test_matmul_scheduler.cpp
@@ -3376,6 +3376,8 @@ TEST_P(HopperMatmulSchedulerTest, FusedMultiplySum) {
   tref = atMatmul(A.squeeze(), B.squeeze(), layout);
 }
 
+// TODO: Remove this test once the architecture-agnostic test can be
+// run on Hopper.
 TEST_P(HopperMatmulSchedulerTest, FusedMultiplySumBiasNeg) {
   if (use_smem_epilogue) {
     GTEST_SKIP()

From 34ec5e156fc8edee51fe1004744f4be78fb88241 Mon Sep 17 00:00:00 2001
From: protonu
Date: Fri, 13 Dec 2024 10:42:42 -0800
Subject: [PATCH 3/5] removing header

---
 csrc/scheduler/hopper_multi_matmul.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp
index fb324d73251..8a05138fb0d 100644
--- a/csrc/scheduler/hopper_multi_matmul.cpp
+++ b/csrc/scheduler/hopper_multi_matmul.cpp
@@ -9,7 +9,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include

From d87495b8b30b63ad440c83625d00c9040a99994d Mon Sep 17 00:00:00 2001
From: protonu
Date: Fri, 13 Dec 2024 11:56:30 -0800
Subject: [PATCH 4/5] addressing reviewer comments

---
 csrc/scheduler/hopper_multi_matmul.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp
index 8a05138fb0d..98bf7cff37c 100644
--- a/csrc/scheduler/hopper_multi_matmul.cpp
+++ b/csrc/scheduler/hopper_multi_matmul.cpp
@@ -435,17 +435,19 @@ void HopperMultipleMatmulScheduler::scheduleMmaResults() {
 }
 
 void HopperMultipleMatmulScheduler::scheduleEpilogue() {
-  // Load/cache the epilogue inputs if there are any.
-  auto& c_tvs = tensor_roles_.at(MatmulTensorRole::EPILOGUE_INPUT);
   std::vector<TensorView*> cached_tvs;
-  for (auto* c : c_tvs) {
-    cached_tvs.push_back(c->cacheAfter());
-  }
+  std::vector<TensorView*> c_tvs;
 
-  // Propato to (not including) the splitk output if there is a splitk
+  // Propagate to (not including) the splitk output if there is a splitk
   // else this is just mma_results_
-  std::vector<TensorView*> propagate_to = splitk_sums_;
+  std::vector<TensorView*> propagate_to =
+      splitk_sums_.empty() ? mma_results_ : splitk_sums_;
   if (tensor_roles_.count(MatmulTensorRole::EPILOGUE_INPUT)) {
+    auto& c_tvs = tensor_roles_.at(MatmulTensorRole::EPILOGUE_INPUT);
+    // Load/cache the epilogue inputs if there are any.
+    for (auto* c : c_tvs) {
+      cached_tvs.push_back(c->cacheAfter());
+    }
     propagate_to.insert(propagate_to.end(), c_tvs.begin(), c_tvs.end());
   }
 
@@ -464,6 +466,7 @@ void HopperMultipleMatmulScheduler::scheduleEpilogue() {
           d->getLoopDomain());
       d->setLoopDomain(s.as<std::vector<IterDomain*>>());
 
+      // TODO: We need to check bank conflicts in this path.
       scheduler_utils::BoundedDirectionalTransformPropagator::backward(
           d,
          -1,
@@ -476,9 +479,6 @@ void HopperMultipleMatmulScheduler::scheduleEpilogue() {
       // TODO: support vectorization_factor.
       d->axis(-1)->parallelize(ParallelType::Vectorize);
       scheduler_utils::parallelizeAllLike(d, -1, cached_tvs);
-
-      // The cached EPILOGUE_INPUT tvs are not needed anymore
-      cached_tvs.clear();
     }
   } else {
     constexpr int64_t stmatrix_tile_m = 16;
     constexpr int64_t stmatrix_tile_n = 16;

From 09cb4079d5b37ca6ad3d11710300e585e08534ce Mon Sep 17 00:00:00 2001
From: protonu
Date: Fri, 13 Dec 2024 12:57:38 -0800
Subject: [PATCH 5/5] fixing a bug

---
 csrc/scheduler/hopper_multi_matmul.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp
index 98bf7cff37c..91048d3374c 100644
--- a/csrc/scheduler/hopper_multi_matmul.cpp
+++ b/csrc/scheduler/hopper_multi_matmul.cpp
@@ -436,7 +436,6 @@ void HopperMultipleMatmulScheduler::scheduleEpilogue() {
   std::vector<TensorView*> cached_tvs;
-  std::vector<TensorView*> c_tvs;
 
   // Propagate to (not including) the splitk output if there is a splitk
   // else this is just mma_results_
   std::vector<TensorView*> propagate_to =
       splitk_sums_.empty() ? mma_results_ : splitk_sums_;
@@ -478,7 +477,9 @@ void HopperMultipleMatmulScheduler::scheduleEpilogue() {
       // inner-dim with extent 2.
       // TODO: support vectorization_factor.
       d->axis(-1)->parallelize(ParallelType::Vectorize);
-      scheduler_utils::parallelizeAllLike(d, -1, cached_tvs);
+      if (!cached_tvs.empty()) {
+        scheduler_utils::parallelizeAllLike(d, -1, cached_tvs);
+      }
     }
   } else {
     constexpr int64_t stmatrix_tile_m = 16;
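---

A note on the MmaSwizzler splits used in patch 1: the decomposition
[Mii/16, Miioi(2), Miii(8), Nii/8, Niio(4), Niii(2)] encodes which warp-group
thread owns each accumulator element, and Niii(2) is the contiguous pair that
the epilogue vectorizes. The standalone sketch below is not part of the
patches; it assumes the PTX-documented wgmma m64nN accumulator layout, and the
names (Owner, ownerOf) are illustrative. It maps an (m, n) coordinate of a
64-row accumulator tile to the owning thread and per-thread register index:

// Minimal sketch, assuming the standard PTX wgmma accumulator layout; it
// mirrors the split [Mii/16, Miioi(2), Miii(8), Nii/8, Niio(4), Niii(2)].
#include <cassert>
#include <cstdio>

struct Owner {
  int tidx; // 0..127: one warp group; Mii/16 * Miii(8) * Niio(4) = 128 = TIDx
  int reg;  // per-thread element index, loop order [Nii/8, Miioi(2), Niii(2)]
};

Owner ownerOf(int m, int n, int tile_n) {
  assert(m >= 0 && m < 64 && n >= 0 && n < tile_n);
  const int warp = m / 16;        // Mii/16: warp within the warp group
  const int miioi = (m % 16) / 8; // Miioi(2): row r or r+8 inside the warp
  const int miii = m % 8;         // Miii(8): lane / 4
  const int niio = (n % 8) / 2;   // Niio(4): lane % 4
  const int niii = n % 2;         // Niii(2): the vectorized pair of elements
  return Owner{warp * 32 + miii * 4 + niio, (n / 8) * 4 + miioi * 2 + niii};
}

int main() {
  // (0, 0) and (0, 1) land in the same thread as adjacent registers, which
  // is why the epilogue vectorizes the inner dimension with extent 2.
  const Owner a = ownerOf(0, 0, 16);
  const Owner b = ownerOf(0, 1, 16);
  std::printf(
      "(0,0) -> t%d r%d, (0,1) -> t%d r%d\n", a.tidx, a.reg, b.tidx, b.reg);
  return 0;
}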