From 5d1d07e341ed687eb75b2daaf3da14e47e9f998d Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 12:54:09 -0800
Subject: [PATCH] Resolve conflicts by recomputation

---
 csrc/scheduler/resize.cpp             |  61 ++++++--
 csrc/scheduler/tools/resize_utils.cpp |  21 ++-
 csrc/scheduler/tools/resize_utils.h   |  19 ++-
 tests/cpp/test_resize.cpp             | 208 ++++++++++++++++++++------
 4 files changed, 243 insertions(+), 66 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index 194087b90e8..dece9f79238 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -73,19 +73,6 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
 
   auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
 
-  if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
-          resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
-      !non_exclusive_resizes.empty()) {
-    std::stringstream msg;
-    msg << "Propagation of resizes would affect fusion outputs.";
-    for (const auto& [tv, resize_ids] : non_exclusive_resizes) {
-      msg << " Resize input tv: " << tv->toString()
-          << ", resize input ID groups: " << nvfuser::toString(resize_ids);
-    }
-    scheduler_debug_utils::canScheduleRejectReason(schedulerType(), msg.str());
-    return false;
-  }
-
   // Slicing of or to a broadcast ID is not allowed yet.
   for (auto tensor_op : resize_based_tensor_ops) {
     TensorView* out_tv = tensor_op->output(0)->as<TensorView>();
@@ -133,6 +120,30 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
     return false;
   }
 
+  for (auto out_tv : ir_utils::filterByType<TensorView>(fusion->outputs())) {
+    if (out_tv == ref_tv) {
+      continue;
+    }
+    auto exprs = ValGraphBFS::getExprGroupsBetween(
+                     broadcast_graph,
+                     broadcast_graph.toGroups(ref_tv->getLogicalDomain()),
+                     broadcast_graph.toGroups(out_tv->getLogicalDomain()),
+                     /*require_all_to_visited=*/false)
+                     .first;
+    for (const auto& [expr_g, dir] : exprs) {
+      if (expr_g->front()->isA<Resize>()) {
+        std::stringstream msg;
+        msg << "Resize between reference and output not allowed.";
+        msg << " Reference: " << ref_tv->toString()
+            << ". Output: " << out_tv->toString()
+            << ". Resize: " << expr_g->front()->toString();
+        scheduler_debug_utils::canScheduleRejectReason(
+            schedulerType(), msg.str());
+        return false;
+      }
+    }
+  }
+
   // Disable the scheduler if there's a squeeze op. The loop option
   // may also need to be enabled in that case, but that option is not
   // turned on automatically yet.
@@ -163,6 +174,21 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
   scheduler_utils::cacheInputs(fusion, true);
   scheduler_utils::cacheAndForkOutputs(fusion, true);
 
+  auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
+
+  IdModel id_model(fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  // Replicate resize inputs if necessary to avoid conflicting propagations
+  for (const auto& [out_tv, exclusivity_info] :
+       scheduler_tools::getNonExclusiveResizeInfo(
+           resize_based_tensor_ops, exact_graph)) {
+    auto resize_based_op = out_tv->definition();
+    auto inp_tv = resize_based_op->input(0)->as<TensorView>();
+    auto inp_tv_copy = RecomputeTv::recompute(inp_tv);
+    ir_utils::replaceValInExprInputs(resize_based_op, inp_tv, inp_tv_copy);
+  }
+
   for (auto expr : fusion->exprs()) {
     if (!expr->isOneOf<SliceOp, PadOp>()) {
       continue;
@@ -186,9 +212,14 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
   ref_tv->axis(-1)->parallelize(ParallelType::TIDx);
   ref_tv->axis(-2)->parallelize(ParallelType::BIDx);
 
-  // Propagate the reference to the other tensors
+  // Propagate the reference to the other tensors. Note that the
+  // update flag is enabled so as to work around the resize propagation
+  // issue. This may not work if there's a tensor that is reshaped
+  // from the reference tensor, but that should not be the case as the
+  // reference is picked by the same routine used for the pointwise
+  // scheduler.
   scheduler_tools::scheduleLoopDomainsLike(
-      fusion->allTvs(), ref_tv->getLoopDomain());
+      fusion->allTvs(), ref_tv->getLoopDomain(), true);
 
   inlineMost();
 
diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index f1206f99676..d7987b1b412 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -66,13 +66,14 @@ void propagateResizeToInputs(Expr* resize_tensor_op) {
   }
 }
 
-std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
     const std::vector<Expr*>& ordered_resize_tensor_ops,
-    const ValGraph& exact_graph) {
+    const ValGraph& exact_graph,
+    bool ignore_fusion_inputs) {
   NVF_ERROR(!ordered_resize_tensor_ops.empty());
   Fusion* fusion = ordered_resize_tensor_ops[0]->fusion();
 
-  std::unordered_map<TensorView*, ValGroups> non_exclusive_resizes;
+  std::unordered_map<TensorView*, ResizeExclusivityInfo> non_exclusive_resizes;
 
   std::unordered_set<Val*> inputs{
       fusion->inputs().begin(), fusion->inputs().end()};
@@ -95,6 +96,8 @@ std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
     auto inp_tv = dynamic_cast<TensorView*>(resize_tensor_op->inputs().at(0));
     auto out_tv = dynamic_cast<TensorView*>(resize_tensor_op->outputs().at(0));
 
+    ResizeExclusivityInfo info;
+
     ValGroups resize_inp_ids = get_root_to_logical_resizes(out_tv);
     NVF_ERROR(!resize_inp_ids.empty());
 
@@ -107,6 +110,10 @@ std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
     // visible changes through the tensor, the resize is considered
     // non-exclusive.
     for (auto dep_tv : ir_utils::filterByType<TensorView>(dep_vals)) {
+      if (ignore_fusion_inputs && dep_tv->isFusionInput()) {
+        continue;
+      }
+
       bool maybe_non_exclusive = false;
 
       if (dep_tv->isFusionOutput()) {
@@ -159,10 +166,16 @@ std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
       }
 
       // This resize input ID is not exclusively used
-      non_exclusive_resizes[inp_tv].pushBack(resize_inp_id);
+      // non_exclusive_resizes[inp_tv].first.pushBack(resize_inp_id);
+      info.shared_tvs.push_back(dep_tv);
+      info.resized_ids.pushBack(resize_inp_id);
     }
   }
 
+  if (!info.shared_tvs.empty()) {
+    NVF_ERROR(non_exclusive_resizes.emplace(out_tv, info).second);
+  }
+
   // Analysis of exclusiveness until in_tv is done. Following
   // resize-based tensor ops do not need to check the same section
   // of the fusion and can start from out_tv.
diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index 1245f166c65..8b3667c5dd3 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -94,9 +94,24 @@ void propagateResizeToInputs(Expr* resize_op);
 // The function returns a map from tensors that are input to
 // non-exclusive ops to their resize input ID groups. This map will be
 // used to resolve the non-exclusiveness by replication.
-std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+struct ResizeExclusivityInfo {
+  std::vector<TensorView*> shared_tvs;
+  // std::unordered_map<TensorView*, ValGroups> resized_ids;
+  ValGroups resized_ids;
+
+  bool operator==(const ResizeExclusivityInfo& other) const {
+    return shared_tvs == other.shared_tvs && resized_ids == other.resized_ids;
+  }
+
+  bool operator!=(const ResizeExclusivityInfo& other) const {
+    return !(*this == other);
+  }
+};
+
+std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
     const std::vector<Expr*>& ordered_resize_tensor_ops,
-    const ValGraph& exact_graph);
+    const ValGraph& exact_graph,
+    bool ignore_fusion_inputs = false);
 
 } // namespace scheduler_tools
 } // namespace nvfuser
diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index 1c3ca839baa..7d4864b0d55 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4427,25 +4427,83 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs2) {
   fusion.addOutput(tv3);
   fusion.addOutput(tv6);
 
-  IdModel id_model(&fusion, /*build_graphs=*/false);
-  const auto& exact_graph = id_model.buildExactGraph();
-
-  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
-      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
-
-  // tv1 is the input of the first slice, which is not exclusive as
-  // tv1 is also a producer of tv4.
-  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv1),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+  {
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+
+    // tv2 is the output of the first slice, which is not exclusive as
+    // tv1 is also a producer of tv4.
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+    // Similarly, tv5 is the output of the second slice, which is not exclusive
+    // as tv1 is also a producer of tv2.
+    EXPECT_EQ(non_exclusive_resize_info.count(tv5), 1);
+    scheduler_tools::ResizeExclusivityInfo tv5_info{
+        {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv5), tv5_info);
+  }
 
-  // Similary, tv4 is the input of the second slice, which is not exclusive as
-  // tv1 is also a producer of tv2.
-  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv4),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)}));
+  // Test replication-based mitigation of conflicts
+  {
+    Fusion fusion_copy = fusion;
+    FusionGuard fg(&fusion_copy);
+
+    auto tv0 = fusion_copy.inputs().at(0)->as<TensorView>();
+    auto tv2 =
+        fusion_copy.outputs().at(0)->definition()->input(0)->as<TensorView>();
+    auto slice = dynamic_cast<SliceOp*>(tv2->definition());
+    ASSERT_NE(slice, nullptr);
+    auto tv1 = slice->input(0)->as<TensorView>();
+    auto tv5 =
+        fusion_copy.outputs().at(1)->definition()->input(0)->as<TensorView>();
+    auto tv4 = tv5->definition()->input(0)->as<TensorView>();
+
+    // Replicate tv1 for tv2
+    auto private_copy = RecomputeTv::recompute(tv1);
+    ir_utils::replaceValInExprInputs(slice, tv1, private_copy);
+
+    // The two slices should still be reported as non-exclusive, but now
+    // they are both shared at the fusion input.
+    IdModel id_model(&fusion_copy, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion_copy), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv0->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+    EXPECT_EQ(non_exclusive_resize_info.count(tv5), 1);
+    scheduler_tools::ResizeExclusivityInfo tv5_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv5), tv5_info);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  EXPECT_FALSE(runtime->isSegmented());
+  const auto& heuristic_param =
+      runtime->schedulerHeuristics()->heuristicsList().front();
+  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+  Fusion* scheduled_fusion =
+      dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
+  checkLoopDomainEquivalence(
+      scheduled_fusion->outputs().at(0)->as<TensorView>());
 }
 
 // Non-exclusive slice due to a dependency to a fusion output
@@ -4486,12 +4544,57 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
   auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
       ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
 
-  // tv3 is the input of the slice, which is not exclusive as
+  // tv4 is the output of the slice, which is not exclusive as
   // tv3 depends on tv2, which is a fusion output
-  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv3),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+  scheduler_tools::ResizeExclusivityInfo tv4_info{
+      {tv2}, exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
+
+  // Test replication-based mitigation of conflicts
+  {
+    Fusion fusion_copy = fusion;
+    FusionGuard fg(&fusion_copy);
+
+    auto tv0 = fusion_copy.inputs().at(0)->as<TensorView>();
+    auto tv5 = fusion_copy.outputs().at(1)->as<TensorView>();
+    auto tv4 = tv5->definition()->input(0)->as<TensorView>();
+    auto tv3 = tv4->definition()->input(0)->as<TensorView>();
+
+    auto private_copy = RecomputeTv::recompute(tv3);
+    ir_utils::replaceValInExprInputs(tv4->definition(), tv3, private_copy);
+
+    IdModel id_model(&fusion_copy, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion_copy), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.size(), 1);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+    scheduler_tools::ResizeExclusivityInfo tv4_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv0->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
+  }
+
+  GTEST_SKIP() << "Scheduling not yet supported due to broadcast";
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  auto t1 = at::randn({16}, options);
+  std::vector<c10::IValue> inputs({t0, t1});
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  EXPECT_FALSE(runtime->isSegmented());
+  const auto& heuristic_param =
+      runtime->schedulerHeuristics()->heuristicsList().front();
+  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+  Fusion* scheduled_fusion =
+      dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
+  checkLoopDomainEquivalence(
+      scheduled_fusion->outputs().at(0)->as<TensorView>());
 }
 
 // Slice input tensor depends on a fusion output, but the slice is
@@ -4671,10 +4774,29 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs6) {
   auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
       ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
   EXPECT_EQ(non_exclusive_resize_info.size(), 1);
-  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv1),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+  EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+  scheduler_tools::ResizeExclusivityInfo tv2_info{
+      {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+  // When scheduled, since the shape of tv4 is different from the
+  // shape of tv5, this fusion is segmented. One segment is a resize
+  // segment consisting of the tv2 and tv3 slices. The other is a
+  // pointwise segment for tv5.
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  const auto& heuristic_list = runtime->schedulerHeuristics()->heuristicsList();
+  EXPECT_EQ(heuristic_list.size(), 2);
+  // They should be a combination of a resize scheduler and a pointwise
+  // scheduler
+  EXPECT_TRUE(
+      (heuristic_list[0]->scheduler_type == SchedulerType::PointWise &&
+       heuristic_list[1]->scheduler_type == SchedulerType::Resize) ||
+      (heuristic_list[0]->scheduler_type == SchedulerType::Resize &&
+       heuristic_list[1]->scheduler_type == SchedulerType::PointWise));
 }
 
 // RoPE-like rotation patten
@@ -4774,19 +4896,17 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
   const auto& exact_graph = id_model.buildExactGraph();
   auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
       ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
-  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv1),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
-  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv3),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+  EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+  scheduler_tools::ResizeExclusivityInfo tv2_info{
+      {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+  scheduler_tools::ResizeExclusivityInfo tv4_info{
+      {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
   // These two entries should be all the info map has.
   EXPECT_EQ(non_exclusive_resize_info.size(), 2);
 
-  GTEST_SKIP() << "Scheduling not yet supported";
-
   FusionExecutorCache executor_cache(std::move(fusion_ptr));
   auto out_tensors = executor_cache.runFusionWithInputs(inputs);
   testValidate(
@@ -4914,19 +5034,17 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
   const auto& exact_graph = id_model.buildExactGraph();
   auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
       ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
-  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv1),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
-  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv3),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+  EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+  scheduler_tools::ResizeExclusivityInfo tv2_info{
+      {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+  scheduler_tools::ResizeExclusivityInfo tv4_info{
+      {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
  // These two entries should be all the info map has.
  EXPECT_EQ(non_exclusive_resize_info.size(), 2);
 
-  GTEST_SKIP() << "Scheduling not yet supported";
-
   FusionExecutorCache executor_cache(std::move(fusion_ptr));
   auto out_tensors = executor_cache.runFusionWithInputs(inputs);
   testValidate(
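
Usage note (supplementary; not part of the diff above): the recomputation
workflow that ResizeScheduler::schedule() now performs can also be read as a
standalone pass. The sketch below is a minimal illustration assembled only
from calls that appear in this patch (getNonExclusiveResizeInfo,
RecomputeTv::recompute, ir_utils::replaceValInExprInputs); the helper name
privatizeNonExclusiveResizeInputs is hypothetical.

  // Give every non-exclusive resize-based op a private copy of its input so
  // that propagating the resize to producers cannot affect the tensors
  // recorded in ResizeExclusivityInfo::shared_tvs.
  void privatizeNonExclusiveResizeInputs(Fusion* fusion) {
    auto resize_based_tensor_ops =
        ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
    if (resize_based_tensor_ops.empty()) {
      return;
    }
    IdModel id_model(fusion, /*build_graphs=*/false);
    const auto& exact_graph = id_model.buildExactGraph();
    // The info map is keyed by the resize output; its definition is the
    // SliceOp/PadOp whose input needs to be replicated.
    for (const auto& [out_tv, info] :
         scheduler_tools::getNonExclusiveResizeInfo(
             resize_based_tensor_ops, exact_graph)) {
      Expr* resize_based_op = out_tv->definition();
      auto* inp_tv = resize_based_op->input(0)->as<TensorView>();
      TensorView* inp_tv_copy = RecomputeTv::recompute(inp_tv);
      ir_utils::replaceValInExprInputs(resize_based_op, inp_tv, inp_tv_copy);
    }
  }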