Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow output to alias intermediate tensor, step-2 to solve group norm segmentation issue #2375 #2405

Merged
merged 31 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
957e561
reshape noop
liqiangxl Jun 8, 2024
a8b950f
wip
liqiangxl Jun 8, 2024
01ca998
wip
liqiangxl Jun 13, 2024
f4677fa
Merge branch 'llu/group_norm' of https://github.com/nvidia/fuser into…
liqiangxl Jun 13, 2024
6878ed4
Merge branch 'main' into llu/group_norm
liqiangxl Jun 13, 2024
8ff121d
clean
liqiangxl Jun 13, 2024
a11289b
revert alias analysis
liqiangxl Jun 13, 2024
fc88498
extend alias analysis
liqiangxl Jun 13, 2024
307642d
skip AllocationDomainPass
liqiangxl Jun 13, 2024
a5f7bd8
add tests
liqiangxl Jun 13, 2024
9ff6ffc
tests
liqiangxl Jun 13, 2024
8ace48b
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jun 18, 2024
ca19e6f
reshape in reduction/norm scheduler
liqiangxl Jun 18, 2024
a3e993d
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jun 25, 2024
d50e790
clean
liqiangxl Jun 25, 2024
7175220
only keep alias for output view intermediate
liqiangxl Jun 25, 2024
526d915
merge main
liqiangxl Jul 15, 2024
7a131d4
output alias intermediate
liqiangxl Jul 15, 2024
4e412c1
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jul 15, 2024
6ff7f1d
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jul 16, 2024
f8a24a7
resolve conflict
liqiangxl Jul 17, 2024
14b5f69
Update csrc/alias_analysis.h
liqiangxl Jul 17, 2024
2c80957
Update csrc/scheduler/no_op.cpp
liqiangxl Jul 17, 2024
87e8cf8
Update tests/cpp/test_gpu_view.cpp
liqiangxl Jul 17, 2024
ca2d25a
trivial fix
liqiangxl Jul 17, 2024
cc0fc7a
move check to finalize
liqiangxl Jul 17, 2024
2e2bc41
merge main
liqiangxl Jul 17, 2024
9cec99e
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jul 17, 2024
b23cf8b
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jul 18, 2024
c9a7c58
clang tidy
liqiangxl Jul 18, 2024
a622530
merge main
liqiangxl Jul 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 65 additions & 8 deletions csrc/alias_analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <vector>

#include <alias_analysis.h>
#include <compute_at_map.h>
#include <dispatch.h>
#include <fusion.h>
#include <ir/interface_nodes.h>
Expand All @@ -17,6 +18,8 @@
#include <ir/utils.h>
#include <linked_hash_map.h>
#include <logical_domain_map.h>
#include <scheduler/reduction_utils.h>
#include <scheduler/registry_utils.h>

namespace nvfuser {

Expand Down Expand Up @@ -425,18 +428,69 @@ bool okToRelayout(
: tv->getMaybeAllocationDomain());
return new_layout.isCompliantWith({allocation, tv->getContiguity()});
}

// Returns true if an output tv's definition op may cause segmentation.
// Only a view op interfering with a reduction is considered.
// TODO: other ops?
bool outputInterferingReduction(Fusion* fusion) {
  // At least one output must be defined by a view op. Guard against outputs
  // with no definition (e.g. an input forwarded directly as an output), whose
  // definition() is nullptr.
  const auto& output_tvs =
      ir_utils::filterByType<TensorView>(fusion->outputs());
  if (std::none_of(
          output_tvs.begin(), output_tvs.end(), [](TensorView* out_tv) {
            return out_tv->definition() != nullptr &&
                out_tv->definition()->isA<ViewOp>();
          })) {
    return false;
  }

  // The fusion must have reduction tvs; otherwise a view can't interfere
  // with a reduction.
  const auto& reduction_tvs = scheduler_utils::getReductionTvs(fusion);
  if (reduction_tvs.empty()) {
    return false;
  }

  // Treat a required forward view replay as interfering.
  ComputeAtMap ca_map(fusion);
  if (registry_utils::requiresForwardViewReplay(fusion, ca_map)) {
    return true;
  }

  // Check if the view op interferes with the representative reduction tv.
  auto ref_redu_tv =
      reduction_scheduler_utils::getRepresentativeReductionTv(reduction_tvs);
  if (registry_utils::reductionInterferingView(fusion, ca_map, ref_redu_tv)) {
    return true;
  }

  return false;
}

// Whether the upward alias walk should stop at this expression.
// Currently only view ops act as stopping points, and only when
// `stop_at_view` is enabled.
// TODO: other ops?
bool isOpsToStop(const Expr* expr, bool stop_at_view) {
  if (!stop_at_view) {
    return false;
  }
  return expr->isA<ViewOp>();
}

} // namespace

void AliasAnalysisResult::finalize(
const bool can_override_empty_allocation_domain) {
Fusion* fusion,
const bool can_override_empty_allocation_domain,
const bool may_alias_intermediate) {
// If allow output to alias intermediate, then we don't need to walk up
// the chain to find an input or output root. It changes the last reshape
// in group norm to a no-op.
bool stop_at_view =
may_alias_intermediate && outputInterferingReduction(fusion);
for (auto [alias, source_and_layout] : alias_to_source_) {
auto [root, preferred_layout] = source_and_layout;
// Walks up the `alias_to_source_` chain.
while (root != nullptr && !root->isFusionInput() &&
!root->isFusionOutput()) {
const auto i = alias_to_source_.find(root);
root = (i == alias_to_source_.end() ? nullptr : i->second.first);
if (!isOpsToStop(alias->definition(), stop_at_view)) {
while (root != nullptr && !root->isFusionInput() &&
!root->isFusionOutput()) {
const auto i = alias_to_source_.find(root);
root = (i == alias_to_source_.end() ? nullptr : i->second.first);
}
}

if (root == nullptr) {
continue;
}
Expand Down Expand Up @@ -487,7 +541,8 @@ std::string AliasAnalysisResult::toString(const int indent_size) const {

AliasAnalysisResult findAliases(
Fusion* fusion,
const bool can_override_empty_allocation_domain) {
const bool can_override_empty_allocation_domain,
const bool may_alias_intermediate) {
AliasAnalysisResult analysis;
AliasFinder finder(analysis);
// Fusion::exprs() computes and returns topological order.
Expand All @@ -499,7 +554,9 @@ AliasAnalysisResult findAliases(
// results).
finder.dispatch(expr);
}
analysis.finalize(can_override_empty_allocation_domain);

analysis.finalize(
fusion, can_override_empty_allocation_domain, may_alias_intermediate);
return analysis;
}

Expand Down
8 changes: 6 additions & 2 deletions csrc/alias_analysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,10 @@ class AliasAnalysisResult {
// Computes transitive aliases and caches them in `alias_to_root_`.
// See `findAliases` for the meaning of
// `can_override_empty_allocation_domain`.
void finalize(bool can_override_empty_allocation_domain);
// NOTE(review): parameter renamed from `can_alias_intermediate` to
// `may_alias_intermediate` so the declaration matches the definition in
// alias_analysis.cpp and the corresponding `findAliases` parameter.
// Declaration-side parameter names don't affect callers.
void finalize(
    Fusion* fusion,
    bool can_override_empty_allocation_domain,
    bool may_alias_intermediate);

// Returns the preferred layout. If `alias` is not in `alias_to_source_`,
// returns the `TensorView`'s initial layout.
Expand Down Expand Up @@ -113,6 +116,7 @@ class AliasAnalysisResult {
// Fusion::aliasOutputToInput to mark aliases.
AliasAnalysisResult findAliases(
Fusion* fusion,
bool can_override_empty_allocation_domain = true);
bool can_override_empty_allocation_domain = true,
bool may_alias_intermediate = false);

} // namespace nvfuser
6 changes: 4 additions & 2 deletions csrc/preseg_passes/mark_aliases_prepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
namespace nvfuser::preseg_passes {

void MarkAliasesPreparePass::runPass(Fusion* fusion) {
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/true);
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/true,
/*may_alias_intermediate=*/true);
if (isDebugDumpEnabled(DebugDumpOption::PreSegmenterLogging)) {
debug() << "Alias analysis result:" << std::endl;
debug() << analysis.toString(/*indent_size=*/1) << std::endl;
Expand Down
6 changes: 4 additions & 2 deletions csrc/scheduler/mark_aliases.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ void markAliases(Fusion* fusion) {
fusion->printMath();
}

const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/false);
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/false,
/*may_alias_intermediate=*/false);
if (isDebugDumpEnabled(DebugDumpOption::SchedulerVerbose)) {
vlog("Alias analysis result:\n", analysis.toString(/*indent_size=*/1));
}
Expand Down
6 changes: 4 additions & 2 deletions csrc/scheduler/no_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ NoOpScheduler::NoOpScheduler(

namespace {
bool allOutputsArePointerArithmetics(Fusion* fusion) {
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/false);
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/false,
/*may_alias_intermediate=*/false);
auto out_tvs = ir_utils::filterByType<TensorView>(fusion->outputs());
return std::all_of(
out_tvs.begin(), out_tvs.end(), [&analysis](TensorView* out) {
Expand Down
15 changes: 15 additions & 0 deletions csrc/scheduler/reduction_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -937,5 +937,20 @@ std::ostream& operator<<(std::ostream& os, ReductionType reduction_type) {
return os;
}

TensorView* getRepresentativeReductionTv(
    const std::vector<TensorView*>& reduction_tvs) {
  // Remember the first inner (fastest-dim) and the first outer reduction tv
  // encountered while scanning the whole list.
  TensorView* first_inner = nullptr;
  TensorView* first_outer = nullptr;
  for (TensorView* tv : reduction_tvs) {
    TensorView*& slot =
        scheduler_utils::isFastestDimReduction(tv) ? first_inner : first_outer;
    if (slot == nullptr) {
      slot = tv;
    }
  }
  // Prefer the inner reduction tv (used for inner and innerOuter reduction);
  // otherwise fall back to the outer one. nullptr if the input is empty.
  return first_inner != nullptr ? first_inner : first_outer;
}
} // namespace reduction_scheduler_utils
} // namespace nvfuser
8 changes: 8 additions & 0 deletions csrc/scheduler/reduction_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,5 +105,13 @@ std::string toString(ReductionType reduction_type);
ReductionType getReductionType(Fusion* fusion);
ReductionType getReductionType(const std::vector<TensorView*>& reduction_tvs);

//! Get the representative reduction tv from the given reduction tvs.
//! If there are no reduction tvs, return nullptr.
//! If there are only inner reduction tvs, return the first inner reduction tv.
//! If there are only outer reduction tvs, return the first outer reduction tv.
//! If there are both inner and outer reduction tvs, return the first inner
//! reduction tv.
TensorView* getRepresentativeReductionTv(
const std::vector<TensorView*>& reduction_tvs);
} // namespace reduction_scheduler_utils
} // namespace nvfuser
119 changes: 118 additions & 1 deletion tests/cpp/test_gpu_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@
#include <iostream>

namespace nvfuser {
using testing::Contains;

using namespace at::indexing;

using GpuViewTest = NVFuserTest;

TEST_F(GpuViewTest, FusionViewDtypeSameSizeOutput) {
Expand Down Expand Up @@ -2379,4 +2379,121 @@ TEST_F(GpuViewTest, SplitMergePointwiseSplitMerge) {
testValidate(executor_cache.fusion(), {cg_outputs}, {t0}, __LINE__, __FILE__);
}

namespace {
// gmock matcher: succeeds when the argument's heuristic() equals the
// expected `heuristic` value. Used to count segment groups by scheduler.
MATCHER_P(HeuristicIs, heuristic, "") {
  return arg->heuristic() == heuristic;
}
} // namespace

// Original group norm pattern: currently segmented into 3 kernels
// (pointwise, normalization, pointwise).
// NOTE(review): removed the unused local `group_shape_wb` — it was defined
// but never referenced anywhere in this test.
TEST_F(GpuViewTest, GroupNormOriginal) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  // N = batch, C = channels, H/W = spatial dims, G = number of groups.
  const int64_t N = 2, C = 128, H = 16, W = 16, G = 32;
  const std::vector<int64_t> input_shape = {N, C, H, W};
  const std::vector<int64_t> group_shape = {N, G, C / G, H, W};
  const std::vector<int64_t> input_shape_wb = {C};
  DataType dtype = DataType::Half;
  auto tv0 = makeContigTensor(input_shape.size(), dtype);
  auto tv1 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  auto tv2 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  fusion->addInput(tv0);
  fusion->addInput(tv1);
  fusion->addInput(tv2);
  // pointwise ops, e.g. cast
  auto tv3 = castOp(DataType::Float, tv0);
  // reshape from {N, C, H, W} to {N, G, C / G, H, W}
  auto tv4 = reshape(tv3, input_shape, group_shape);
  // normalization: reduce over the last three dims of the grouped shape
  auto tv5 = sum(tv4, {-1, -2, -3});
  auto tv6 = broadcast(tv5, {false, false, true, true, true});
  auto tv7 = div(tv4, tv6);
  // reshape back to {N, C, H, W}
  auto tv8 = reshape(tv7, group_shape, input_shape);
  // pointwise ops, e.g. scale, bias, cast
  auto tv9 = broadcast(tv1, {true, false, true, true});
  auto tv10 = broadcast(tv2, {true, false, true, true});
  auto tv11 = mul(tv8, tv9);
  auto tv12 = add(tv11, tv10);
  auto tv13 = castOp(dtype, tv12);
  fusion->addOutput(tv13);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto options_wb = at::TensorOptions()
                        .dtype(data_type_to_aten(DataType::Float))
                        .device(at::kCUDA, 0);
  auto t0 = at::randn(input_shape, options);
  auto tw = at::randn(input_shape_wb, options_wb);
  auto tb = at::randn(input_shape_wb, options_wb);
  // ATen reference computation mirroring the fusion above.
  auto t1 = t0.reshape(group_shape).to(at::kFloat);
  auto t2 = t1.sum({-1, -2, -3}).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1);
  auto t3 = t1 / t2;
  auto t4 = t3.reshape(input_shape);
  auto t5 = tw.unsqueeze(0).unsqueeze(-1).unsqueeze(-1);
  auto t6 = tb.unsqueeze(0).unsqueeze(-1).unsqueeze(-1);
  auto t7 = t4.mul(t5).add(t6);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, tw, tb});
  // should expect 1 after adding a pre-segment pass to move reshape to input
  // and output.
  auto seg_groups =
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();

  EXPECT_THAT(
      seg_groups, Contains(HeuristicIs(ScheduleHeuristic::PointWise)).Times(2));
  EXPECT_THAT(
      seg_groups,
      Contains(HeuristicIs(ScheduleHeuristic::InnerPersistent)).Times(1));

  testValidate(
      executor_cache.fusion(),
      cg_outputs,
      {t0, tw, tb},
      {t7},
      __LINE__,
      __FILE__);
}

// An output produced by a trailing reshape can alias the intermediate
// tensor: expect one NoOp segment plus one InnerPersistent segment.
TEST_F(GpuViewTest, OutputAliasIntermediate) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  const int64_t N = 2, C = 128, H = 16, W = 16, G = 32;
  const std::vector<int64_t> input_shape = {N, G, C / G, H, W};
  const std::vector<int64_t> output_shape = {N, C, H, W};
  DataType dtype = DataType::Half;

  // Fusion definition: cast -> copy -> normalize -> cast back -> reshape.
  auto in_tv = makeContigTensor(input_shape.size(), dtype);
  fusion->addInput(in_tv);
  auto in_float = castOp(DataType::Float, in_tv);
  auto cached = set(in_float);
  auto group_sum = sum(cached, {-1, -2, -3});
  auto sum_bcast = broadcast(group_sum, {false, false, true, true, true});
  auto normalized = div(cached, sum_bcast);
  auto out_half = castOp(dtype, normalized);
  auto out_reshaped = reshape(out_half, input_shape, output_shape);
  fusion->addOutput(out_reshaped);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto at_in = at::randn(input_shape, options);
  // ATen reference computation mirroring the fusion above.
  auto at_float = at_in.to(at::kFloat);
  auto at_sum =
      at_float.sum({-1, -2, -3}).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1);
  auto at_norm = at_float / at_sum;
  auto at_ref = at_norm.reshape(output_shape);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({at_in});
  const std::vector<SegmentedGroup*>& seg_groups =
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
  EXPECT_THAT(
      seg_groups, Contains(HeuristicIs(ScheduleHeuristic::NoOp)).Times(1));
  EXPECT_THAT(
      seg_groups,
      Contains(HeuristicIs(ScheduleHeuristic::InnerPersistent)).Times(1));
  testValidate(
      executor_cache.fusion(),
      cg_outputs,
      {at_in},
      {at_ref},
      __LINE__,
      __FILE__);
}

} // namespace nvfuser
Loading