Skip to content

Commit

Permalink
Revert "Allow output to alias intermediate tensor, step-2 to solve gr…
Browse files Browse the repository at this point in the history
…oup norm segmentation issue #2375 (#2405)"

This reverts commit 15bdf9f.
  • Loading branch information
wujingyue committed Jul 21, 2024
1 parent 59a951f commit 7259833
Show file tree
Hide file tree
Showing 8 changed files with 18 additions and 204 deletions.
73 changes: 8 additions & 65 deletions csrc/alias_analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <vector>

#include <alias_analysis.h>
#include <compute_at_map.h>
#include <dispatch.h>
#include <fusion.h>
#include <ir/interface_nodes.h>
Expand All @@ -18,8 +17,6 @@
#include <ir/utils.h>
#include <linked_hash_map.h>
#include <logical_domain_map.h>
#include <scheduler/reduction_utils.h>
#include <scheduler/registry_utils.h>

namespace nvfuser {

Expand Down Expand Up @@ -428,69 +425,18 @@ bool okToRelayout(
: tv->getMaybeAllocationDomain());
return new_layout.isCompliantWith({allocation, tv->getContiguity()});
}

// Returns true if some output tv's defining op may cause segmentation.
// Currently only a view (reshape) op interfering with a reduction is
// considered; returns false when there is no view-defined output, no
// reduction, or the view provably does not interfere.
// TODO: other ops?
bool outputInterferingReduction(Fusion* fusion) {
  // At least one output must be defined by a view op.
  const auto& output_tvs =
      ir_utils::filterByType<TensorView>(fusion->outputs());
  if (std::none_of(
          output_tvs.begin(), output_tvs.end(), [](TensorView* out_tv) {
            // An output that is forwarded directly from a fusion input has
            // no defining expression; guard against dereferencing nullptr.
            Expr* def = out_tv->definition();
            return def != nullptr && def->isA<ViewOp>();
          })) {
    return false;
  }

  // The fusion must contain reduction tvs for interference to matter.
  const auto& reduction_tvs = scheduler_utils::getReductionTvs(fusion);
  if (reduction_tvs.empty()) {
    return false;
  }

  // A view that requires forward replay already forces segmentation.
  ComputeAtMap ca_map(fusion);
  if (registry_utils::requiresForwardViewReplay(fusion, ca_map)) {
    return true;
  }

  // Check whether the view op interferes with a representative reduction.
  auto ref_redu_tv =
      reduction_scheduler_utils::getRepresentativeReductionTv(reduction_tvs);
  if (registry_utils::reductionInterferingView(fusion, ca_map, ref_redu_tv)) {
    return true;
  }

  return false;
}

// Decides whether the alias-source walk should stop at `expr`.
// Only a view (reshape) op is treated as a stopping point, and only when
// the caller opted in via `stop_at_view`.
// TODO: other ops?
bool isOpsToStop(const Expr* expr, bool stop_at_view) {
  if (!stop_at_view) {
    return false;
  }
  return expr->isA<ViewOp>();
}

} // namespace

void AliasAnalysisResult::finalize(
Fusion* fusion,
const bool can_override_empty_allocation_domain,
const bool may_alias_intermediate) {
// If allow output to alias intermediate, then we don't need to walk up
// the chain to find an input or output root. It changes the last reshape
// in group norm to a no-op.
bool stop_at_view =
may_alias_intermediate && outputInterferingReduction(fusion);
const bool can_override_empty_allocation_domain) {
for (auto [alias, source_and_layout] : alias_to_source_) {
auto [root, preferred_layout] = source_and_layout;
if (!isOpsToStop(alias->definition(), stop_at_view)) {
while (root != nullptr && !root->isFusionInput() &&
!root->isFusionOutput()) {
const auto i = alias_to_source_.find(root);
root = (i == alias_to_source_.end() ? nullptr : i->second.first);
}
// Walks up the `alias_to_source_` chain.
while (root != nullptr && !root->isFusionInput() &&
!root->isFusionOutput()) {
const auto i = alias_to_source_.find(root);
root = (i == alias_to_source_.end() ? nullptr : i->second.first);
}

if (root == nullptr) {
continue;
}
Expand Down Expand Up @@ -541,8 +487,7 @@ std::string AliasAnalysisResult::toString(const int indent_size) const {

AliasAnalysisResult findAliases(
Fusion* fusion,
const bool can_override_empty_allocation_domain,
const bool may_alias_intermediate) {
const bool can_override_empty_allocation_domain) {
AliasAnalysisResult analysis;
AliasFinder finder(analysis);
// Fusion::exprs() computes and returns topological order.
Expand All @@ -554,9 +499,7 @@ AliasAnalysisResult findAliases(
// results).
finder.dispatch(expr);
}

analysis.finalize(
fusion, can_override_empty_allocation_domain, may_alias_intermediate);
analysis.finalize(can_override_empty_allocation_domain);
return analysis;
}

Expand Down
8 changes: 2 additions & 6 deletions csrc/alias_analysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,7 @@ class AliasAnalysisResult {
// Computes transitive aliases and caches them in `alias_to_root_`.
// See `findAliases` for the meaning of
// `can_override_empty_allocation_domain`.
void finalize(
Fusion* fusion,
bool can_override_empty_allocation_domain,
bool can_alias_intermediate);
void finalize(bool can_override_empty_allocation_domain);

// Returns the preferred layout. If `alias` is not in `alias_to_source_`,
// returns the `TensorView`'s initial layout.
Expand Down Expand Up @@ -116,7 +113,6 @@ class AliasAnalysisResult {
// Fusion::aliasOutputToInput to mark aliases.
AliasAnalysisResult findAliases(
Fusion* fusion,
bool can_override_empty_allocation_domain = true,
bool may_alias_intermediate = false);
bool can_override_empty_allocation_domain = true);

} // namespace nvfuser
6 changes: 2 additions & 4 deletions csrc/preseg_passes/mark_aliases_prepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
namespace nvfuser::preseg_passes {

void MarkAliasesPreparePass::runPass(Fusion* fusion) {
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/true,
/*may_alias_intermediate=*/true);
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/true);
if (isDebugDumpEnabled(DebugDumpOption::PreSegmenterLogging)) {
debug() << "Alias analysis result:" << std::endl;
debug() << analysis.toString(/*indent_size=*/1) << std::endl;
Expand Down
6 changes: 2 additions & 4 deletions csrc/scheduler/mark_aliases.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,8 @@ void markAliases(Fusion* fusion) {
fusion->printMath();
}

const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/false,
/*may_alias_intermediate=*/false);
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/false);
if (isDebugDumpEnabled(DebugDumpOption::SchedulerVerbose)) {
vlog("Alias analysis result:\n", analysis.toString(/*indent_size=*/1));
}
Expand Down
6 changes: 2 additions & 4 deletions csrc/scheduler/no_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,8 @@ NoOpScheduler::NoOpScheduler(

namespace {
bool allOutputsArePointerArithmetics(Fusion* fusion) {
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/false,
/*may_alias_intermediate=*/false);
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/false);
auto out_tvs = ir_utils::filterByType<TensorView>(fusion->outputs());
return std::all_of(
out_tvs.begin(), out_tvs.end(), [&analysis, fusion](TensorView* out) {
Expand Down
15 changes: 0 additions & 15 deletions csrc/scheduler/reduction_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -937,20 +937,5 @@ std::ostream& operator<<(std::ostream& os, ReductionType reduction_type) {
return os;
}

TensorView* getRepresentativeReductionTv(
    const std::vector<TensorView*>& reduction_tvs) {
  // Remember the first reduction tv of each kind: innermost-dim
  // ("inner") reductions vs. all others ("outer").
  TensorView* first_inner = nullptr;
  TensorView* first_outer = nullptr;
  for (TensorView* tv : reduction_tvs) {
    TensorView*& slot = scheduler_utils::isFastestDimReduction(tv)
        ? first_inner
        : first_outer;
    if (slot == nullptr) {
      slot = tv;
    }
  }
  // Inner reductions take precedence: they represent both inner and
  // inner-outer schedules. Fall back to the first outer reduction,
  // which is nullptr when `reduction_tvs` is empty.
  return first_inner != nullptr ? first_inner : first_outer;
}
} // namespace reduction_scheduler_utils
} // namespace nvfuser
8 changes: 0 additions & 8 deletions csrc/scheduler/reduction_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,5 @@ std::string toString(ReductionType reduction_type);
ReductionType getReductionType(Fusion* fusion);
ReductionType getReductionType(const std::vector<TensorView*>& reduction_tvs);

//! Get the representative reduction tv from the given reduction tvs.
//! If there are no reduction tvs, return nullptr.
//! If there are only inner reduction tvs, return the first inner reduction tv.
//! If there are only outer reduction tvs, return the first outer reduction tv.
//! If there are both inner and outer reduction tvs, return the first inner
//! reduction tv.
TensorView* getRepresentativeReductionTv(
const std::vector<TensorView*>& reduction_tvs);
} // namespace reduction_scheduler_utils
} // namespace nvfuser
100 changes: 2 additions & 98 deletions tests/cpp/test_gpu_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@
#include <iostream>

namespace nvfuser {
using testing::Contains;
using testing::UnorderedElementsAre;

using namespace at::indexing;

using GpuViewTest = NVFuserTest;

TEST_F(GpuViewTest, FusionViewDtypeSameSizeOutput) {
Expand Down Expand Up @@ -2380,102 +2379,6 @@ TEST_F(GpuViewTest, SplitMergePointwiseSplitMerge) {
testValidate(executor_cache.fusion(), {cg_outputs}, {t0}, __LINE__, __FILE__);
}

// segmented into 2 kernels: pointwise and reduction
// Builds a group-norm-like fusion (cast -> reshape to groups -> reduce ->
// normalize -> reshape back -> scale/bias/cast) and checks that it is
// currently segmented into exactly one pointwise and one reduction kernel.
TEST_F(GpuViewTest, GroupNormOriginal) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  // N batches, C channels split into G groups of C / G, over an HxW plane.
  const int64_t N = 2, C = 128, H = 16, W = 16, G = 32;
  const std::vector<int64_t> input_shape = {N, C, H, W};
  const std::vector<int64_t> group_shape = {N, G, C / G, H, W};
  const std::vector<int64_t> input_shape_wb = {C};
  // NOTE(review): group_shape_wb is declared but never used below — confirm
  // whether it was meant for a grouped weight/bias reshape.
  const std::vector<int64_t> group_shape_wb = {G, C / G};
  DataType dtype = DataType::Half;
  // tv0: input tensor; tv1/tv2: per-channel weight and bias in fp32.
  auto tv0 = makeContigTensor(input_shape.size(), dtype);
  auto tv1 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  auto tv2 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  fusion->addInput(tv0);
  fusion->addInput(tv1);
  fusion->addInput(tv2);
  // pointwise ops, e.g. cast
  auto tv3 = castOp(DataType::Float, tv0);
  // reshape from {N, C, H, W} to {N, G, C / G, H, W}
  auto tv4 = reshape(tv3, input_shape, group_shape);
  // normalization
  auto tv5 = sum(tv4, {-1, -2, -3});
  auto tv6 = broadcast(tv5, {false, false, true, true, true});
  auto tv7 = div(tv4, tv6);
  // reshape back to {N, C, H, W}
  auto tv8 = reshape(tv7, group_shape, input_shape);
  // pointwise ops, e.g. scale, bias, cast
  auto tv9 = broadcast(tv1, {true, false, true, true});
  auto tv10 = broadcast(tv2, {true, false, true, true});
  auto tv11 = mul(tv8, tv9);
  auto tv12 = add(tv11, tv10);
  auto tv13 = castOp(dtype, tv12);
  fusion->addOutput(tv13);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto options_wb = at::TensorOptions()
                        .dtype(data_type_to_aten(DataType::Float))
                        .device(at::kCUDA, 0);
  auto t0 = at::randn(input_shape, options);
  auto tw = at::randn(input_shape_wb, options_wb);
  auto tb = at::randn(input_shape_wb, options_wb);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, tw, tb});
  // should expect 1 after adding a pre-segment pass to move reshape to input
  // and output.
  EXPECT_THAT(
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(),
      UnorderedElementsAre(
          HeuristicIs(ScheduleHeuristic::PointWise),
          HeuristicIs(ScheduleHeuristic::Reduction)));

  testValidate(
      executor_cache.fusion(), cg_outputs, {t0, tw, tb}, __LINE__, __FILE__);
}

// Checks that a reshape-defined output can alias an intermediate tensor:
// the final reshape should lower to a NoOp segment while the normalization
// runs as a single InnerPersistent kernel.
TEST_F(GpuViewTest, OutputAliasIntermediate) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  const int64_t N = 2, C = 128, H = 16, W = 16, G = 32;
  // Input already in grouped layout; output reshaped back to {N, C, H, W}.
  const std::vector<int64_t> input_shape = {N, G, C / G, H, W};
  const std::vector<int64_t> output_shape = {N, C, H, W};
  DataType dtype = DataType::Half;
  auto tv0 = makeContigTensor(input_shape.size(), dtype);
  fusion->addInput(tv0);
  // Normalize over the innermost three dims, then reshape for the output.
  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = set(tv1);
  auto tv3 = sum(tv2, {-1, -2, -3});
  auto tv4 = broadcast(tv3, {false, false, true, true, true});
  auto tv5 = div(tv2, tv4);
  auto tv6 = castOp(dtype, tv5);
  auto tv7 = reshape(tv6, input_shape, output_shape);
  fusion->addOutput(tv7);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto t0 = at::randn(input_shape, options);
  // Reference computed with ATen for validation below.
  auto t1 = t0.to(at::kFloat);
  auto t2 = t1.sum({-1, -2, -3}).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1);
  auto t3 = t1 / t2;
  auto t4 = t3.reshape(output_shape);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
  // Expect exactly one NoOp segment (the aliasing reshape) and one
  // InnerPersistent segment (the normalization).
  const std::vector<SegmentedGroup*>& seg_groups =
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
  EXPECT_THAT(
      seg_groups, Contains(HeuristicIs(ScheduleHeuristic::NoOp)).Times(1));
  EXPECT_THAT(
      seg_groups,
      Contains(HeuristicIs(ScheduleHeuristic::InnerPersistent)).Times(1));
  testValidate(
      executor_cache.fusion(), cg_outputs, {t0}, {t4}, __LINE__, __FILE__);
}

using ReductionAxes = std::vector<int64_t>;
class ViewReductionTest : public NVFuserFixtureParamTest<ReductionAxes> {};

Expand Down Expand Up @@ -2619,4 +2522,5 @@ TEST_F(GpuViewTest, GroupNorm) {
testValidate(
executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
}

} // namespace nvfuser

0 comments on commit 7259833

Please sign in to comment.