diff --git a/csrc/ir/interface_nodes.h b/csrc/ir/interface_nodes.h index d39c76ca5cf..cda0c1a8f40 100644 --- a/csrc/ir/interface_nodes.h +++ b/csrc/ir/interface_nodes.h @@ -190,6 +190,10 @@ class NVF_API TensorView : public Val { return domain()->loop(); }; + const std::vector& getInitialLoopDomain() const { + return domain()->initialLoop(); + }; + // If allocation domain exists in domain() return it, otherwise return // logical domain const std::vector& getMaybeAllocationDomain() const { diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h index 78df93d0bd6..baccd384b6a 100644 --- a/csrc/ir/internal_base_nodes.h +++ b/csrc/ir/internal_base_nodes.h @@ -568,11 +568,21 @@ class TensorDomain : public Val { return loop_domain_; } + const std::vector& initialLoop() const { + return initial_loop_domain_; + } + // Check if id is a loop ID. bool isLoop(const IterDomain* id) const { return std::find(loop().begin(), loop().end(), id) != loop().end(); } + // Check if id is an initial loop ID, i.e., a member of initialLoop(). + bool isInitialLoop(const IterDomain* id) const { + return std::find(initialLoop().begin(), initialLoop().end(), id) != + initialLoop().end(); + } + // Get all IDs that is on the shortest path between any of the domains // (logical domain, root domain, loop domain, allocation domain) following // definition and uses path. Return values are topologically ordered and @@ -695,6 +705,10 @@ class TensorDomain : public Val { const std::vector logical_domain_; std::vector allocation_domain_; std::vector loop_domain_; + // Initial loop domain. 
Loop domain is updated with transformations + // such as split, but the initial loop domain can only change with + // setLoopDomain + std::vector initial_loop_domain_; std::vector additional_ids_; std::vector no_bcast_domain_; diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index d5dcb56beb7..20daba19622 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -3044,6 +3044,7 @@ TensorDomain::TensorDomain( logical_domain_(std::move(logical_domain)), allocation_domain_(std::move(allocation_domain)), loop_domain_(std::move(loop_domain)), + initial_loop_domain_(loop_domain_), contiguity_( contiguity.empty() ? getContiguityFilledWith(maybeAllocation(), false) : std::move(contiguity)) { @@ -3073,6 +3074,7 @@ TensorDomain::TensorDomain(IrBuilderPasskey passkey, const TensorDomain* src) logical_domain_(src->logical_domain_), allocation_domain_(src->allocation_domain_), loop_domain_(src->loop_domain_), + initial_loop_domain_(src->initial_loop_domain_), additional_ids_(src->additional_ids_), no_bcast_domain_(src->no_bcast_domain_), no_reduction_domain_(src->no_reduction_domain_), @@ -3085,6 +3087,7 @@ TensorDomain::TensorDomain(const TensorDomain* src, IrCloner* ir_cloner) logical_domain_(ir_cloner->clone(src->logical_domain_)), allocation_domain_(ir_cloner->clone(src->allocation_domain_)), loop_domain_(ir_cloner->clone(src->loop_domain_)), + initial_loop_domain_(ir_cloner->clone(src->initial_loop_domain_)), additional_ids_(ir_cloner->clone(src->additional_ids_)), no_bcast_domain_(ir_cloner->clone(src->no_bcast_domain_)), no_reduction_domain_(ir_cloner->clone(src->no_reduction_domain_)), @@ -3614,6 +3617,7 @@ void TensorDomain::setLoopDomain(std::vector new_loop_domain) { ". 
Logical: ", toDelimitedString(logical_domain_)); loop_domain_ = std::move(new_loop_domain); + initial_loop_domain_ = loop_domain_; resetDomains(); } @@ -3630,17 +3634,11 @@ void TensorDomain::setAllocationDomain( } std::vector TensorDomain::allIDs() const { - // loop_domain_ must be the first domain since loop domains are - // allowed to have extra domains that may not exist in other - // domains and IRBFS::getExprsBetween is not symmetric with respect - // to its two domain parameters. For example, it can find all exprs - // from a loop domain to a logical domain but may miss from logical - // to loop. See NVFuserTest.AllIDsWithExtraLoopIDs for a concrete - // example. - std::array*, 5> all_domains = { - &loop_domain_, + std::array*, 6> all_domains = { &logical_domain_, &root_domain_, + &initial_loop_domain_, + &loop_domain_, &allocation_domain_, &additional_ids_}; VectorOfUniqueEntries discovered_ids; diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index 16cec9ee9dd..cf7c029ffa2 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -6513,7 +6513,7 @@ TEST_F(NVFuserTest, CompareLogicalAndLoopDomains) { "Not all logical IDs are covered by loop domain"))); } -TEST_F(NVFuserTest, AllIDsWithExtraLoopIDs) { +TEST_F(NVFuserTest, AllIDsWithExtraLoopIDs1) { Fusion fusion; FusionGuard fg(&fusion); @@ -6582,6 +6582,86 @@ TEST_F(NVFuserTest, AllIDsWithExtraLoopIDs) { EXPECT_EQ(tv2_all_id_set, tv2_all_ids_ref); } +TEST_F(NVFuserTest, AllIDsWithExtraLoopIDs2) { + Fusion fusion; + FusionGuard fg(&fusion); + + // [i0, i1] + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + // [i0] + auto tv1 = makeSymbolicTensor(1); + fusion.addInput(tv1); + + // [i0] + auto tv2 = set(tv1); + // [i0, b1] + auto tv3 = broadcast(tv2, {false, true}); + // [i0, i1] + auto tv4 = add(tv0, tv3); + fusion.addOutput(tv4); + + // Set the loop domain of tv2 the same as tv4. 
The new loop domain + // includes an ID that is not reachable from tv2 logical domain + auto tv2_inner_loop_domain = + tv4->getLoopDomain().at(1)->cloneWithoutRFactor(); + std::vector tv2_initial_loop_domain{ + tv2->getLogicalDomain().at(0), tv2_inner_loop_domain}; + tv2->setLoopDomain(tv2_initial_loop_domain); + + // Schedule only the extra domain + tv2->split(1, 4); + auto tv2_split = tv2->axis(1)->definition(); + + // tv2 logical: [i0] + // split(i1) -> i1/4, 4 + // tv2 loop: [i0, i1/4, 4] + // + // All IDs: [i0, i1, i1/4, 4] + + EXPECT_EQ(tv2->getInitialLoopDomain(), tv2_initial_loop_domain); + + // Because the split only uses the extra ID, getExprsBetween from + // the loop domain to the logical domain does not traverse the + // split, just returning an empty vector. + EXPECT_TRUE( + IRBFS::getExprsBetween( + {tv2->getLoopDomain().begin(), tv2->getLoopDomain().end()}, + {tv2->getLogicalDomain().begin(), tv2->getLogicalDomain().end()}, + false) + .empty()); + + // From the initial loop to the current loop should find the split expr + auto exprs_between = IRBFS::getExprsBetween( + {tv2->getInitialLoopDomain().begin(), tv2->getInitialLoopDomain().end()}, + {tv2->getLoopDomain().begin(), tv2->getLoopDomain().end()}, + false); + EXPECT_EQ(exprs_between.size(), 1); + EXPECT_EQ(exprs_between.front().first, tv2_split); + + // The initial loop domain and the current loop domain should be + // reachable to each other with no redundancy + auto tv2_loop_domain_comparison_results = ir_utils::compareDomains( + tv2->getInitialLoopDomain(), tv2->getLoopDomain()); + EXPECT_FALSE(tv2_loop_domain_comparison_results.dom0_has_unreachable_ids); + EXPECT_FALSE(tv2_loop_domain_comparison_results.dom1_has_unreachable_ids); + + // Make sure allIDs finds all the IDs including the extra IDs + std::unordered_set tv2_all_ids_ref; + tv2_all_ids_ref.insert( + tv2->getLogicalDomain().begin(), tv2->getLogicalDomain().end()); + tv2_all_ids_ref.insert( + tv2->getInitialLoopDomain().begin(), 
tv2->getInitialLoopDomain().end()); + tv2_all_ids_ref.insert( + tv2->getLoopDomain().begin(), tv2->getLoopDomain().end()); + + auto tv2_all_ids = tv2->domain()->allIDs(); + std::unordered_set tv2_all_id_set( + tv2_all_ids.begin(), tv2_all_ids.end()); + + EXPECT_EQ(tv2_all_id_set, tv2_all_ids_ref); +} + // Repro for issue #236 (https://github.com/NVIDIA/Fuser/issues/236) TEST_F(NVFuserTest, DoublePrecisionNorm_CUDA) { auto fusion = std::make_unique();