From 11f5dce8ac3ff31fd1ca0e40275f17ef8ad77724 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Tue, 10 Dec 2024 17:35:28 -0800
Subject: [PATCH 1/4] Always enable IdModel-based indexing when resize is used

---
 csrc/device_lower/lower2device.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp
index 79652bc67c5..9d39040f4be 100644
--- a/csrc/device_lower/lower2device.cpp
+++ b/csrc/device_lower/lower2device.cpp
@@ -348,6 +348,12 @@ IdModelOptions getIdModelOptions(Fusion* fusion) {
     } else if (expr->isA<MmaOp>()) {
       options.setBuildTensorIndexer(true);
       continue;
+    } else if (expr->isOneOf<SliceOp, PadOp>()) {
+      options.setProducerIndex(true);
+      options.setConsumerIndex(true);
+      options.setInlinePredicate(true);
+      options.setUnswitchPredicate(true);
+      continue;
     } else if (auto reshape = dynamic_cast<ViewOp*>(expr)) {
       // The legacy indexer has an issue when an expand broadcast is
       // involved in reshape transformations. Enable both tensor and

From 05ea88f78b3a4813d09482ef09017637a8da557e Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Tue, 10 Dec 2024 17:44:18 -0800
Subject: [PATCH 2/4] Don't run the tests without IdModel

---
 tests/cpp/test_resize.cpp | 184 ++++++++++----------------------------
 1 file changed, 45 insertions(+), 139 deletions(-)

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index 7d9e357d18b..40dff5237b1 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -55,7 +55,7 @@ void checkLoopDomainEquivalence(
 
 } // namespace
 
-using ResizeTest = NVFuserFixtureParamTest<bool>;
+using ResizeTest = NVFuserTest;
 
 using testing::Each;
 using testing::HasSubstr;
@@ -64,14 +64,8 @@ using testing::Property;
 using testing::ThrowsMessage;
 using testing::UnorderedElementsAre;
 
-INSTANTIATE_TEST_SUITE_P(
-    ,
-    ResizeTest,
-    testing::Bool(),
-    testing::PrintToStringParamName());
-
 // Simple pad test
-TEST_P(ResizeTest, Pad1) {
+TEST_F(ResizeTest, Pad1) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -89,11 +83,7 @@ TEST_P(ResizeTest, Pad1) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -105,7 +95,7 @@
 }
 
 // pad + split
-TEST_P(ResizeTest, Pad2) {
+TEST_F(ResizeTest, Pad2) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -125,11 +115,7 @@ TEST_P(ResizeTest, Pad2) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -141,7 +127,7 @@
 }
 
 // pad, merge + split, inlineMost
-TEST_P(ResizeTest, Pad3) {
+TEST_F(ResizeTest, Pad3) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -178,11 +164,7 @@ TEST_P(ResizeTest, Pad3) {
   std::vector<c10::IValue> aten_inputs({t0, t1});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -192,7 +174,7 @@
 }
 
 // pad + parallelization
-TEST_P(ResizeTest, Pad4) {
+TEST_F(ResizeTest, Pad4) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -212,11 +194,7 @@ TEST_P(ResizeTest, Pad4) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -228,7 +206,7 @@
 }
 
 // pad + parallelization + RAW sync
-TEST_P(ResizeTest, Pad5) {
+TEST_F(ResizeTest, Pad5) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -267,11 +245,7 @@ TEST_P(ResizeTest, Pad5) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -283,7 +257,7 @@
 }
 
 // pad + merge + split parallelization
-TEST_P(ResizeTest, Pad6) {
+TEST_F(ResizeTest, Pad6) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -318,11 +292,7 @@ TEST_P(ResizeTest, Pad6) {
   std::vector<c10::IValue> aten_inputs({t0, t1});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -333,7 +303,7 @@
 }
 
 // pad + unswitch. Having different extents in an unswitched loop nest
 // needs special care (see UnrollPass::canOmitElseClause)
-TEST_P(ResizeTest, Pad7) {
+TEST_F(ResizeTest, Pad7) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -369,11 +339,7 @@ TEST_P(ResizeTest, Pad7) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -430,7 +396,7 @@ TEST_F(ResizeTest, Pad8) {
 }
 #endif
 
-TEST_P(ResizeTest, PadScheduler1) {
+TEST_F(ResizeTest, PadScheduler1) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -448,11 +414,7 @@ TEST_P(ResizeTest, PadScheduler1) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
@@ -462,7 +424,7 @@ TEST_P(ResizeTest, PadScheduler1) {
   NVF_CHECK(ref.equal(cg_outputs[0]));
 }
 
-TEST_P(ResizeTest, PadScheduler2) {
+TEST_F(ResizeTest, PadScheduler2) {
   auto fusion_ptr = std::make_unique<Fusion>();
   auto& fusion = *fusion_ptr;
   FusionGuard fg(fusion_ptr.get());
@@ -487,11 +449,7 @@ TEST_P(ResizeTest, PadScheduler2) {
   std::vector<c10::IValue> aten_inputs({t0, t1});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion_ptr));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
@@ -540,7 +498,7 @@ TEST_F(ResizeTest, PadScheduler3) {
 // Two pad exprs, both using the same symbolic pad widths, segmented
 // into two kernels. Make sure the symbolic inputs are available to
 // both of the segmented kernels.
-TEST_P(ResizeTest, PadScheduler4) {
+TEST_F(ResizeTest, PadScheduler4) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -569,11 +527,7 @@ TEST_P(ResizeTest, PadScheduler4) {
   std::vector<c10::IValue> aten_inputs({t0, 1, 1});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
@@ -584,7 +538,7 @@ TEST_P(ResizeTest, PadScheduler4) {
 
 // Pad a broadcast
 // See https://github.com/NVIDIA/Fuser/issues/798
-TEST_P(ResizeTest, PadBroadcastInput) {
+TEST_F(ResizeTest, PadBroadcastInput) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -609,11 +563,7 @@ TEST_P(ResizeTest, PadBroadcastInput) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
@@ -1437,7 +1387,7 @@ TEST_F(ResizeTest, SliceExtentSimplification) {
       << "Unexpected resize output extent: " << resize_extent->toInlineString();
 }
 
-TEST_P(ResizeTest, PadReduceScheduler1) {
+TEST_F(ResizeTest, PadReduceScheduler1) {
   auto fusion_ptr = std::make_unique<Fusion>();
   auto& fusion = *fusion_ptr;
   FusionGuard fg(fusion_ptr.get());
@@ -1472,11 +1422,7 @@ TEST_P(ResizeTest, PadReduceScheduler1) {
       [](auto pad_extent) { return pad_extent; });
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion_ptr));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
@@ -1761,7 +1707,7 @@ TEST_F(ResizeTest, SoftmaxSliceScheduler2) {
 }
 
 // Same as Pad1 but pad by specified value
-TEST_P(ResizeTest, PadWithValue) {
+TEST_F(ResizeTest, PadWithValue) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -1782,11 +1728,7 @@ TEST_P(ResizeTest, PadWithValue) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -1798,7 +1740,7 @@ TEST_P(ResizeTest, PadWithValue) {
 }
 
 // Same as Pad1 but pad by negative value to create an empty tensor
-TEST_P(ResizeTest, PadToEmptyTensor) {
+TEST_F(ResizeTest, PadToEmptyTensor) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -1821,11 +1763,7 @@ TEST_P(ResizeTest, PadToEmptyTensor) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
@@ -1836,7 +1774,7 @@ TEST_P(ResizeTest, PadToEmptyTensor) {
 }
 
 // Test that padding Half tensor by Double does not promote output
-TEST_P(ResizeTest, PadHalfWithDoubleValue) {
+TEST_F(ResizeTest, PadHalfWithDoubleValue) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -1857,11 +1795,7 @@ TEST_P(ResizeTest, PadHalfWithDoubleValue) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -2421,7 +2355,7 @@ TEST_F(ResizeTest, SliceVectorization) {
 // Concretize a symbolic pad that results in a broadcast (static pads)
 // In this test, the sizes and pad widths are static, so there should be nothing
 // to concretize.
-TEST_P(ResizeTest, ResizePadToBroadcastStatic) {
+TEST_F(ResizeTest, ResizePadToBroadcastStatic) {
   std::vector<int64_t> t0_size = {2, 3, 2, 5, 6};
   std::vector<int64_t> t1_size = {2, 4, 4, 3, 5};
   // Note there are only 8 input scalars for 5D input. Implicit no-pad of dim 0
@@ -2471,11 +2405,7 @@ TEST_P(ResizeTest, ResizePadToBroadcastStatic) {
   std::vector<c10::IValue> aten_inputs({t0, t1});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
@@ -2495,7 +2425,7 @@ TEST_P(ResizeTest, ResizePadToBroadcastStatic) {
 }
 
 // Concretize a symbolic pad that results in a broadcast (dynamic pads)
-TEST_P(ResizeTest, ResizePadToBroadcastDynamic) {
+TEST_F(ResizeTest, ResizePadToBroadcastDynamic) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -2543,11 +2473,7 @@ TEST_P(ResizeTest, ResizePadToBroadcastDynamic) {
   aten_inputs.insert(aten_inputs.end(), pad_widths.begin(), pad_widths.end());
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
@@ -2569,7 +2495,7 @@ TEST_P(ResizeTest, ResizePadToBroadcastDynamic) {
 }
 
 // See https://github.com/NVIDIA/Fuser/issues/596
-TEST_P(ResizeTest, ResizePadToBroadcastIssue596) {
+TEST_F(ResizeTest, ResizePadToBroadcastIssue596) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -2592,11 +2518,7 @@ TEST_P(ResizeTest, ResizePadToBroadcastIssue596) {
   std::vector<c10::IValue> aten_inputs({t0, t1});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   auto args =
       KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
   FusionKernelRuntime runtime(std::move(fusion), args);
@@ -3091,7 +3013,7 @@ TEST_F(ResizeTest, SliceAndReshapeRepro540Manual) {
 // Test concretizing a pad that follows a reshape. This requires the
 // ExpressionEvaluator used in concretization to propagate shapes properly
 // across symbolic reshapes in order to infer the size of the downstream pad.
-TEST_P(ResizeTest, ReshapeToPad) {
+TEST_F(ResizeTest, ReshapeToPad) {
   std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
   Fusion& fusion = *fusion_ptr.get();
   FusionGuard fg(&fusion);
@@ -3113,11 +3035,7 @@ TEST_P(ResizeTest, ReshapeToPad) {
   fusion.addOutput(tv2);
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion_ptr));
 
@@ -3262,7 +3180,7 @@ TEST_F(ResizeTest, CatOfExpandedBroadcast) {
 // padded in the empty dim as well as the expanded dims.
 // This should match test_python_frontend.py::test_pad_expanded_empty
 // See https://github.com/NVIDIA/Fuser/issues/870
-TEST_P(ResizeTest, PadExpandedEmpty) {
+TEST_F(ResizeTest, PadExpandedEmpty) {
   auto fusion_ptr = std::make_unique<Fusion>();
   auto& fusion = *fusion_ptr;
   FusionGuard fg(&fusion);
@@ -3296,11 +3214,7 @@ TEST_P(ResizeTest, PadExpandedEmpty) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   FusionExecutorCache executor_cache(std::move(fusion_ptr));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
@@ -3311,7 +3225,7 @@
 
 // Test that we can pad properly along broadcast dims
 // See https://github.com/NVIDIA/Fuser/issues/868
-TEST_P(ResizeTest, PadOfBroadcast) {
+TEST_F(ResizeTest, PadOfBroadcast) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -3329,11 +3243,7 @@ TEST_P(ResizeTest, PadOfBroadcast) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);
@@ -3344,7 +3254,7 @@
 
 // Test that we can cat along broadcast dims that have been expanded
 // See https://github.com/NVIDIA/Fuser/issues/868
-TEST_P(ResizeTest, PadOfExpandedBroadcast) {
+TEST_F(ResizeTest, PadOfExpandedBroadcast) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -3365,11 +3275,7 @@ TEST_P(ResizeTest, PadOfExpandedBroadcast) {
   std::vector<c10::IValue> aten_inputs({t0});
 
   EnableOptionsGuard enable_options_guard;
-  if (GetParam()) {
-    EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
-  } else {
-    EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel);
-  }
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
   ke.compile(&fusion, aten_inputs);

From 0ad9fea072903490ab308813c9e1f7eb81a2d4a7 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Tue, 10 Dec 2024 22:11:16 -0800
Subject: [PATCH 3/4] fix

---
 csrc/id_model/indexing.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp
index b505a2b6ef7..fdf7aa9c85b 100644
--- a/csrc/id_model/indexing.cpp
+++ b/csrc/id_model/indexing.cpp
@@ -845,14 +845,18 @@ std::vector<Val*> TensorIndexer::getIndexFor(
   const auto& replacement_map = getIndexReplacementMap(
       expr, as_consumer, info.loop_domains, for_loops, info.index_map);
 
-  const auto index_groups = traversalGraph().toGroups(index_ids);
+  // Note that IDs of index_ids may be mapped as the traversal graph
+  // is the AlmostExact graph.
 
   std::vector<Val*> result;
-  result.reserve(index_groups.size());
-  for (const auto& g : index_groups) {
-    auto it = info.index_map.find(g);
+  result.reserve(index_ids.size());
+  for (IterDomain* index_id : index_ids) {
+    const auto& index_group = traversalGraph().toGroup(index_id);
+    auto it = info.index_map.find(index_group);
     NVF_ERROR(
-        it != info.index_map.end(), "Index not found for ", g->toString());
+        it != info.index_map.end(),
+        "Index not found for ",
+        index_id->toString());
     result.push_back(
         ir_utils::replaceValRecursively(it->second, replacement_map));
   }

From 4f1498828bac1d10eb865eadeb301e3266e53822 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Wed, 11 Dec 2024 01:16:18 -0800
Subject: [PATCH 4/4] Allocation ordering fix

---
 csrc/id_model/indexing.cpp | 20 +++++++++++++++----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp
index b505a2b6ef7..46edcfe8367 100644
--- a/csrc/id_model/indexing.cpp
+++ b/csrc/id_model/indexing.cpp
@@ -413,10 +413,11 @@ class AllocationDomainSetup : private kir::IrVisitor {
   }
 
   // Reorder non-logical allocation domains to follow the ordering of
-  // the logical domain. This is necessary when an allocation domain
-  // includes a vectorized loop iter domain since it must be at the
+  // the set allocation domain. This is necessary when an allocation
+  // domain includes a vectorized loop iter domain since it must be at the
   // innermost position but that may not be the case in the loop
-  // domain. Not strictly necessary otherwise, but this should also
+  // domain. It is also necessary when the tensor is a producer of a
+  // vectorized store. Not strictly necessary otherwise, but this should also
   // minimize the deviation from the old indexing scheme which always
   // uses the logical domain to index.
   //
@@ -424,8 +425,17 @@ std::optional<std::vector<IterDomain*>> reorderAllocationDomains(
       const TensorView* tv,
      const std::vector<IterDomain*>& allocation_domains) const {
+    // Use getMaybeAllocationDomain instead of getLogicalDomain. When
+    // this tv is a producer of a vectorized store, the consumer
+    // tensor should be a global memory tensor and this is likely a
+    // cache tensor created by cacheBefore. The consumer tensor may
+    // have a reordered allocation domain and that dictates the actual
+    // allocation ordering of this producer local tensor as well. If
+    // getLogicalDomain is used, DistributedTransformerTest.Backward
+    // fails at the result validation.
     auto exprs = DependencyCheck::getAllExprsBetween(
-        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()},
+        {tv->getMaybeAllocationDomain().begin(),
+         tv->getMaybeAllocationDomain().end()},
         {allocation_domains.begin(), allocation_domains.end()});
 
     if (exprs.empty()) {
@@ -434,7 +444,7 @@
     }
 
     // Replay exprs from the logical domain to get the non-reordered
     // domains
-    auto ordered_domains = tv->getLogicalDomain();
+    auto ordered_domains = tv->getMaybeAllocationDomain();
 
     for (auto expr : exprs) {
       // Find the position to insert the outputs.
       int64_t insertion_pos = -1;
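
A note on the indexing change in [PATCH 3/4]: because the traversal graph is the AlmostExact graph, two entries of index_ids can belong to the same group, so gathering groups with toGroups() (set semantics) can return fewer groups than requested IDs and lose the one-to-one correspondence between index_ids and the returned indices; looking up the group per ID keeps it. Below is a minimal, self-contained C++ sketch of that failure mode and the fix. All names (Id, Group, almost_exact, index_map) are illustrative stand-ins, not nvFuser APIs.

// Sketch only: models why a per-ID group lookup preserves one index per
// requested ID even when several IDs are almost-exact mapped together.
#include <cassert>
#include <map>
#include <set>
#include <string>
#include <vector>

using Id = std::string;
using Group = int;

// Stand-in for the AlmostExact graph: several IDs may map to one group.
const std::map<Id, Group> almost_exact = {
    {"i0", 0}, {"i1", 1}, {"i2", 1}};  // i1 and i2 are almost-exact mapped

// Stand-in for info.index_map: one index expression per group.
const std::map<Group, std::string> index_map = {{0, "idx_a"}, {1, "idx_b"}};

int main() {
  const std::vector<Id> index_ids = {"i0", "i1", "i2"};

  // Old behavior: collecting groups as a set deduplicates them.
  std::set<Group> groups;
  for (const Id& id : index_ids) {
    groups.insert(almost_exact.at(id));
  }
  assert(groups.size() == 2);  // only two groups for three requested IDs

  // Fixed behavior: one lookup per requested ID keeps a 1:1 result.
  std::vector<std::string> result;
  result.reserve(index_ids.size());
  for (const Id& id : index_ids) {
    result.push_back(index_map.at(almost_exact.at(id)));
  }
  assert(result.size() == index_ids.size());  // one index per ID
  return 0;
}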
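Similarly, for the allocation-ordering fix in [PATCH 4/4]: the reference order for reordering now comes from the tensor's explicitly set allocation domain when one exists, falling back to the logical domain otherwise, which is what getMaybeAllocationDomain returns. A small sketch of that fallback-and-reorder idea follows, again with illustrative toy types rather than nvFuser code.

// Sketch only: prefer the set allocation domain as the reference ordering.
#include <algorithm>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Tensor {
  std::vector<std::string> logical;               // logical domain
  std::optional<std::vector<std::string>> alloc;  // set allocation domain
};

// Mirrors the getMaybeAllocationDomain idea: prefer the set allocation
// domain, fall back to the logical domain.
const std::vector<std::string>& maybeAllocationDomain(const Tensor& tv) {
  return tv.alloc ? *tv.alloc : tv.logical;
}

int main() {
  // A cacheBefore-style producer whose consumer reordered its allocation
  // domain to {k, i, j}; ordering must follow that, not the logical {i, j, k}.
  Tensor tv{{"i", "j", "k"}, std::vector<std::string>{"k", "i", "j"}};

  std::vector<std::string> domains = {"j", "k", "i"};
  const auto& ref = maybeAllocationDomain(tv);
  std::sort(domains.begin(), domains.end(),
            [&](const std::string& a, const std::string& b) {
              return std::find(ref.begin(), ref.end(), a) <
                     std::find(ref.begin(), ref.end(), b);
            });
  for (const auto& d : domains) {
    std::cout << d << " ";  // prints: k i j
  }
  std::cout << "\n";
  return 0;
}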