pointwise scheduler fails to validate reference tv #3513

Open
jjsjann123 wants to merge 72 commits into main from pw_scheduler_reference_find_patch
Changes from 69 commits
Commits (72), all by jjsjann123:
f0ce0e3 - will this work? (Dec 2, 2024)
70e31bf - errr (Dec 2, 2024)
5f09e36 - missed a few renaming (Dec 2, 2024)
9ad9edb - WIP (Dec 2, 2024)
6540201 - test added (Dec 2, 2024)
89dc741 - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 4, 2024)
ed56c75 - WIP (Dec 4, 2024)
f1e7e0a - WIP (Dec 4, 2024)
9d174c9 - declaration (Dec 4, 2024)
bf425eb - WIP (Dec 4, 2024)
aef13ac - WIP (Dec 4, 2024)
a9ae516 - refactor the traversal (Dec 4, 2024)
d9e8dc0 - WIP (Dec 4, 2024)
7333806 - scratch that, it's getting out of hand (Dec 4, 2024)
f6ad363 - Revert "scratch that, it's getting out of hand" (Dec 4, 2024)
cef0b83 - try focus on expanded dimensions (Dec 4, 2024)
a557a8b - wip (Dec 5, 2024)
65cb621 - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 5, 2024)
f88ebf7 - lintrunner (Dec 5, 2024)
ea89b69 - comment added (Dec 5, 2024)
0fc0dc1 - fixing (Dec 5, 2024)
07797a4 - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 5, 2024)
94e2ddf - Apply suggestions from code review (Dec 6, 2024)
3e2b43e - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 6, 2024)
63284b6 - reverting unintended changes (Dec 6, 2024)
e39ec58 - adding unit tests (Dec 6, 2024)
fa4d8ab - WIP (Dec 6, 2024)
66bc533 - unit test (Dec 6, 2024)
26054c3 - WIP (Dec 6, 2024)
3d2b926 - WIP, seems to found another issue here (Dec 7, 2024)
bb659f8 - revert unsafe exception (Dec 7, 2024)
45bb785 - moving tests to uniform (Dec 7, 2024)
b744086 - Revert "moving tests to uniform" (Dec 7, 2024)
3a16c65 - do not use random for validation (Dec 7, 2024)
3b9c97f - fixing tests (Dec 7, 2024)
54176a7 - fixing tests and comments (Dec 7, 2024)
5b668d6 - skip the check for transpose scheduler to ensure no performance regre… (Dec 9, 2024)
3112ebd - allowing unmatched broadcast dimension (Dec 9, 2024)
db4cabc - CLANGFORMAT (Dec 9, 2024)
a60bdc6 - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 9, 2024)
55ddfc8 - TYPO (Dec 9, 2024)
0972ca2 - Merge remote-tracking branch 'origin/pw_scheduler_reference_find_patc… (Dec 9, 2024)
3e4cf86 - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 9, 2024)
325f5bb - lifting the broadcast exception, in case we change how expand is mode… (Dec 9, 2024)
8a92a15 - Merge remote-tracking branch 'origin/pw_scheduler_reference_find_patc… (Dec 9, 2024)
880d73e - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 9, 2024)
22a7561 - fixing false negative tests (Dec 9, 2024)
2bc97bf - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 10, 2024)
49767da - WIP addressing review comments (Dec 10, 2024)
2cb3372 - typo (Dec 10, 2024)
73c66f8 - refactor the logic per review comments/discussions (Dec 11, 2024)
dbd5995 - fixing signature (Dec 11, 2024)
b7f628f - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 11, 2024)
4ba5baa - updating tests, removing asserts (Dec 11, 2024)
1d021c7 - Merge remote-tracking branch 'origin/pw_scheduler_reference_find_patc… (Dec 11, 2024)
742f7f3 - removing checks that are not exposed by scheduler (Dec 11, 2024)
145d902 - renaming things (Dec 11, 2024)
19291c8 - err somehow I missed this one (Dec 11, 2024)
526e6b7 - updating tests (Dec 12, 2024)
0ecc1f6 - adding another test (Dec 12, 2024)
6abaa1d - test fixing (Dec 12, 2024)
e8a4ddd - fixing tests (Dec 12, 2024)
6ada657 - CLANGFORMAT (Dec 12, 2024)
d797df8 - removing python test since it's already covered in cpp test (Dec 12, 2024)
25362cd - oops, assert was placed in the wrong spot (Dec 12, 2024)
a129e72 - CLANGFORMAT (Dec 12, 2024)
b7f2efb - adding naoya's example (Dec 12, 2024)
307569f - I was padding the wrong dimension here (Dec 12, 2024)
af315c7 - made a small refactor to avoid regression (Dec 12, 2024)
7668d4e - Merge branch 'main' into pw_scheduler_reference_find_patch (Dec 12, 2024)
d46323c - committing something so I can trigger CI again (Dec 12, 2024)
c70f160 - Merge remote-tracking branch 'origin/pw_scheduler_reference_find_patc… (Dec 12, 2024)
6 changes: 3 additions & 3 deletions csrc/scheduler/pointwise.cpp
@@ -43,7 +43,7 @@ class DomainMap : public pointwise_utils::DomainMap {
if (isValidReference(output_tv) &&
hasMinimumSize(output_tv, minimum_num_axes) &&
!output_tv->isFusionInput()) {
int64_t n_dims = pointwise_utils::nRootDims(output_tv);
int64_t n_dims = pointwise_utils::nLogicalDims(output_tv);
if (n_dims > max_dims) {
result = output_tv;
max_dims = n_dims;
@@ -529,11 +529,11 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams* pparams) {

int64_t max_dims = 0;
for (auto inp : input_tvs) {
max_dims = std::max(pointwise_utils::nRootDims(inp), max_dims);
max_dims = std::max(pointwise_utils::nLogicalDims(inp), max_dims);
}

for (auto out : output_tvs) {
max_dims = std::max(pointwise_utils::nRootDims(out), max_dims);
max_dims = std::max(pointwise_utils::nLogicalDims(out), max_dims);
}

// If everything is zero dim tensors, just return.
147 changes: 146 additions & 1 deletion csrc/scheduler/pointwise_utils.cpp
@@ -142,6 +142,137 @@ bool DomainMap::areAllInputIdsMappedTo(TensorView* input_tv, TensorView* tv)
return in_concrete_ids.empty();
}

// Note: ideally we would want to check that reference_tv contains all iter
// domains in target_tv, so that transformation applied on reference_tv can be
// propagated to target_tv. But we don't have an easy way to check that. Instead
// of that, this function checks that all source iter domains involved in
// transformations on target_tv are covered by reference_tv. Source iter domains
// of a TensorView are IDs that don't have a definition and are producers of
// any IDs on the logical domain of the given TensorView.
//
// ------
//
// e.g 0.
// T34 [i0, i1]
// T185 [i0, b2, i1] = broadcast(T34)
// T192 [i0, b3(ex), i1] = expand(T185)
// T198 [i0, b3(ex)*i1] = reshape(T192)
// output(T34)
// output(T198)
//
// If we consider taking T34 as reference_tv, then T198 is the target_tv. We
// can't replay T34's transform of merging all the dimensions onto T198, since
// b3(ex)*i1 can't be reversed. The check in this function would give us T34
// with sources i0, i1, while T198 has sources i0, b3, i1, where b3 isn't
// contained in T34. Hence we'll reject this reference_tv.
//
// ------
//
// e.g 1.
// T0 [i0, i1]
// T1 [i2, i0, i1]
// T2 [i0*i1] = reshape(T0)
// T3 [b3, i0, i1] = broadcast(T0)
// T4 [i2, i0, i1] = add(T1, T3)
// output(T2)
// output(T4)
//
// The example above should be able to pick T4 as reference_tv. T2's sources i0
// and i1 are both contained by the sources of T4, so this example can be scheduled
// as a single fusion.
bool DomainMap::areAllTargetIdsCoveredBy(
TensorView* target_tv,
TensorView* reference_tv) const {
auto get_source_iter_domains = [this](TensorView* tv) {
// traverse back to collect all disjoint set producer IDs for each ID in the
// logical domain of tv.
VectorOfUniqueEntries<std::shared_ptr<VectorOfUniqueEntries<IterDomain*>>>
all_producer_sets;
std::for_each(
tv->getLogicalDomain().begin(),
tv->getLogicalDomain().end(),
[&](IterDomain* tv_logical_id) {
all_producer_sets.pushBack(
ca_map_.disjointSetOf(tv_logical_id, IdMappingMode::EXACT));
});
all_producer_sets.pushBack(
ca_map_.getAllDisjointSetProducers(all_producer_sets));

std::vector<IterDomain*> source_ids;
// filtering all producer IDs with empty definition to get source iter
// domains
std::for_each(
all_producer_sets.vector().begin(),
all_producer_sets.vector().end(),
[&source_ids,
this](const std::shared_ptr<VectorOfUniqueEntries<IterDomain*>>&
producer_set_ptr) {
IterDomain* producer_id = producer_set_ptr->front();
if (ca_map_.uniqueExactDefinitions(producer_id).empty()) {
source_ids.push_back(producer_id);
}
});
return source_ids;
};

// This contains all source iter domains covered by reference_tv, so
// it's safe for target_tv to have them.
std::unordered_set<IterDomain*> covered_source_ids;
for (IterDomain* source_id_ref : get_source_iter_domains(reference_tv)) {
covered_source_ids.insert(source_id_ref);
}
// It's safe to have an unmapped broadcast IterDomain. There are quite a few
// tests expecting the pointwise scheduler to handle this pattern

Collaborator:
This seems to cover only broadcast IDs that have no transformation. For example, if a broadcast ID is split, the two split output IDs should be safe to have in the other output tensors, but since the original broadcast ID is not included in covered_source_ids, I suspect the reference would be considered invalid.

Author (jjsjann123):
TL;DR: good call, let me add a test to verify and change it.

Earlier I had this inside the loop below (line 246); I think that wouldn't have the problem.

for (IterDomain* id : get_source_producers(output_tv)) {
  if (id->isBroadcast()) {
    continue;
  }
...
}

I changed that to this version because I was concerned that might be too loose a check.
I was mostly worried that if we later decide to add expand as an ID op, we won't be able to identify expand ops. But I think I can add another check along with ca_map_.uniqueExactDefinitions for that. At least we have CI guarding that. I'll put a comment there so we'll remember how to fix it then.

Author (jjsjann123):
Logging an offline conversation: due to issue #1126, we don't see split/merge on broadcast here.
I'm keeping this code as-is. We added an assert that broadcast IDs on the logical domain cannot have a definition. 🤞 Let's see if that causes any CI issues.

Collaborator:
The assertion may not hold when a non-broadcast ID is sliced to a broadcast ID. See #3513 (comment)
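For illustration, here is a rough Python-frontend sketch of the pattern raised above: slicing a regular dimension down to extent 1 leaves a logical ID that has a Resize definition (which is why the assert just below also allows Resize). The shapes, names, and the assumption that such a sliced ID behaves like a broadcast are illustrative only and are not taken from this PR's tests.

import torch
from nvfuser import FusionDefinition, DataType

def sliced_to_extent_one(fd: FusionDefinition) -> None:
    T0 = fd.define_tensor(shape=[16, 32], contiguity=[True, True],
                          dtype=DataType.Float, is_cpu=False)
    # Slice the second dimension down to extent 1; the resulting logical ID is
    # produced by a Resize rather than being definition-free.
    T1 = fd.ops.slice(T0, start_indices=[0, 0], end_indices=[16, 1], strides=[1, 1])
    T2 = fd.ops.neg(T0)
    fd.add_output(T1)  # output whose logical domain contains the resized ID
    fd.add_output(T2)  # output mapped to the original input domains

with FusionDefinition() as fd:
    sliced_to_extent_one(fd)

outputs = fd.execute([torch.randn(16, 32, device="cuda")])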

for (IterDomain* id_out : target_tv->getLogicalDomain()) {
if (id_out->isBroadcast()) {
NVF_ERROR(
id_out->definition() == nullptr ||
id_out->definition()->isA<Resize>());

// Note that ideally we should also be able to handle merge/split on
// broadcast IDs, so we should really move this skip inside the loop below
// `get_source_iter_domains(target_tv)` and skip broadcast source IDs.
// Currently we have the issue that split/merge does not preserve expanded
// broadcasts, see issue: https://github.com/NVIDIA/Fuser/issues/1126
covered_source_ids.insert(id_out);
}
}
// Note: there are certain cases where it's safe to have dangling IDs,
// e.g
// T34 [i0, i1]
// T185 [i0, b2, i1] = broadcast(T34)
// T192 [i0, b3(ex), i1] = expand(T185)
// It's safe to propagate T34 to T192, since b3(ex) is not involved in the
// propagation. But this isn't generally safe. If the above example is changed
// to e.g
// T34 [i0, i1]
// T185 [i0, b2, i1] = broadcast(T34)
// T186 [i0, i4, i1] = ones({i0, i4, i1})
// T193 [i0, i4, i1] = add(T185, T186)
// It's unsafe to propagate from T34 to T193, see issue
// https://github.com/NVIDIA/Fuser/issues/3542

// Check all source iter domains involved in producing target_tv
for (IterDomain* source_id_out : get_source_iter_domains(target_tv)) {
// NOTE: we use the concrete id instead. This allows us to link indirect
// broadcasts. In the example below:
//   T2[i0, i1] = T0[i0, b0] + T1[i0, i1]
//   T3[i0, i9] = pad(T0[i0, b0])
// i9 in T3
//   -> source ID b0
//   -> concrete-mapped to i1
// So T3 is contained by T2. See test `PointwiseTest.DomainMapPad1`
auto concrete_source_id_out =
ca_map_.getConcreteMappedID(source_id_out, IdMappingMode::PERMISSIVE);

Author (jjsjann123):
This is the change I made in order to avoid a regression in the added test, PointwiseTest.DomainMapPad1.

Author (jjsjann123):
Looks like this did work with our tests.

@naoyam Let me know what you think about this change.
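For reference, a minimal Python-frontend sketch of the pad-on-broadcast pattern described in the NOTE above; this is not the actual PointwiseTest.DomainMapPad1 C++ test, and the shapes, pad widths, and function name are made-up assumptions.

import torch
from nvfuser import FusionDefinition, DataType

def pad_broadcast_fusion(fd: FusionDefinition) -> None:
    # T0[i0] and T1[i0, i1]
    T0 = fd.define_tensor(shape=[16], contiguity=[True],
                          dtype=DataType.Float, is_cpu=False)
    T1 = fd.define_tensor(shape=[16, 32], contiguity=[True, True],
                          dtype=DataType.Float, is_cpu=False)
    # T0b[i0, b0]: introduce the broadcast dimension
    T0b = fd.ops.broadcast_in_dim(T0, shape=[16, 1], broadcast_dims=[0])
    T2 = fd.ops.add(T0b, T1)       # T2[i0, i1]: b0 is resolved against i1
    T3 = fd.ops.pad(T0b, [0, 31])  # T3[i0, i9]: pad the broadcast dimension
    fd.add_output(T2)
    fd.add_output(T3)

with FusionDefinition() as fd:
    pad_broadcast_fusion(fd)

outputs = fd.execute([
    torch.randn(16, device="cuda"),
    torch.randn(16, 32, device="cuda"),
])

As the NOTE describes, with the PERMISSIVE concrete mapping the source b0 of i9 maps to i1 of T2, so T2 can still serve as the reference.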

// If we find any source_id_out that's not covered, our propagation could
// fail, since transformations involving this iter domain can't be
// resolved.
if (!getMappedInputConcreteID(covered_source_ids, concrete_source_id_out)) {
return false;
}
}
return true;
}

// Reference domains must exactly match with the input domains. See
// also PR #661
IterDomain* DomainMap::getMappedInputConcreteID(
@@ -233,7 +364,7 @@ IterDomain* DomainMap::anyMapped(
}

// Determine if output TensorView is a valid reference tensor for this fusion.
// The reference tensor must map to all the iterDomains in each input.
// The reference tensor must map to all the iterDomains in each input and output.
bool DomainMap::isValidReference(TensorView* tv) const {
for (auto input_tv : ir_utils::filterByType<TensorView>(fusion_->inputs())) {
if (input_tv->uses().empty()) {
@@ -245,6 +376,20 @@ bool DomainMap::isValidReference(TensorView* tv) const {
return false;
}
}
// The check on outputs is optional; the transpose scheduler might propose a
// secondary reference that only applies to a subset of IO tensors. Ideally we
// should have a more robust check and consider the IO groups instead of
// blindly skipping outputs.
for (auto output_tv :
ir_utils::filterByType<TensorView>(fusion_->outputs())) {
// no need to check for self.
if (output_tv == tv) {
continue;
}
if (!areAllTargetIdsCoveredBy(output_tv, tv)) {
return false;
}
}
return true;
}

11 changes: 9 additions & 2 deletions csrc/scheduler/pointwise_utils.h
@@ -28,14 +28,21 @@ class DomainMap {
}

// Determine if a TensorView is a valid reference tensor for this fusion.
// The reference tensor must map to all the iterDomains in each input.
// The reference tensor must map to all the iterDomains in each input and
// output.
bool isValidReference(TensorView* tv) const;

protected:
// Determine if all IterDomains are mapped between input and the given tvs
bool areAllInputIdsMappedTo(TensorView* input_tv, TensorView* output_tv)
const;

// Determine if all source IterDomains in target_tv are contained by
// reference_tv; this ensures transformations from reference_tv can be
// propagated to target_tv.
bool areAllTargetIdsCoveredBy(TensorView* target_tv, TensorView* reference_tv)
const;

virtual IterDomain* getMappedInputConcreteID(
const std::unordered_set<IterDomain*>& in_concrete_ids,
IterDomain* out_id) const;
@@ -63,7 +70,7 @@

// Returns the number of non-reduction/non-broadcast/non-device dims in the logical
// domain
inline int64_t nRootDims(const TensorView* tv) {
inline int64_t nLogicalDims(const TensorView* tv) {
auto logical_dom = tv->getLogicalDomain();
int64_t tv_n_dims = 0;
for (auto dim : logical_dom) {
18 changes: 13 additions & 5 deletions csrc/scheduler/transpose.cpp
@@ -170,8 +170,16 @@ class DomainMap : public pointwise_utils::DomainMap {
TensorView* result = nullptr;
int64_t max_dims = -1;
for (auto tv : group) {
// Since the transpose scheduler has a different set of references, we skip the ID

Collaborator:
I'm not sure why. The DomainMap of the transpose scheduler has findReferenceFor that tries to find a reference for each group, so there are multiple references. Each of the reference tensors is supposed to pass isValidReference, so I think each reference should be mapped with all the input tensors. Why isn't it the case with fusion outputs?

Author (jjsjann123):
> Each of the reference tensors is supposed to pass isValidReference, so I think each reference should be mapped with all the input tensors.

What I was trying to say is that this is what the transpose scheduler is doing today, and I don't think that's the right thing to do either.

If you look at how the scheduler propagates the parallelization strategy for group 2 and for group 1, the propagation is limited to the other TVs in its own group. It's not supposed to be propagated to all inputs and outputs.

I think we took a shortcut in the first place by reusing pointwise_utils::isValidReference when we check the legitimacy of each reference.

TL;DR: if we apply the same check to all outputs, we are going to fail some CI tests that check that the transpose scheduler takes the fusion.
I think we need a follow-up PR that adds a transpose_utils::isValidReference instead of reusing pointwise_utils::isValidReference.

Collaborator:
I'm not sure why transposing would create disconnected tensors. Can you show an example?

Author (jjsjann123), Dec 10, 2024:
In this example

import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id38(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[1024, 128], contiguity=[True, True], dtype=DataType.Float, is_cpu=False, stride_order=[1, 0])
    T1 = fd.define_tensor(shape=[1024, 128], contiguity=[True, True], dtype=DataType.Float, is_cpu=False, stride_order=[0, 1])

    T2 = fd.ops.broadcast_in_dim(T1, shape=[1, 1024, 128], broadcast_dims=[1, 2])

    T3 = fd.ops.broadcast_in_dim(T0, shape=[1, 1024, 128], broadcast_dims=[1, 2])
    T4 = fd.ops.broadcast_in_dim(T3, shape=[32, 1024, 128], broadcast_dims=[0, 1, 2])

    T5 = fd.ops.add(T4, T2)
    T6 = fd.ops.reshape(T5, new_shape=[32*1024, 128])
    fd.add_output(T6)

with FusionDefinition() as fd:
    nvfuser_fusion_id38(fd)

inputs = [
    torch.testing.make_tensor((1024, 128), dtype=torch.float32, device='cuda:0'),
    torch.testing.make_tensor((1024*128), dtype=torch.float32, device='cuda:0').as_strided((1024, 128), (1, 1024)),
]
o = fd.execute(inputs)

def foo(inputs):
    T2 = inputs[1].unsqueeze(0)
    T3 = inputs[0].expand((32, 1024, 128))
    T4 = T2 + T3
    return T4.reshape(32*1024, 128)

o_ref = foo(inputs)

assert o[0].allclose(o_ref)
g{(transpose)
group id: 0
inputs:
  T0_g_float[iS0{1024}, iS1{128}] float
  T1_g_float[iS2{1024}, iS3{128}] float
outputs:
  T9_g_float[iS30{32768}rf, iS27{128}] float


T4_l_float[bS10{1}, iS11{1024}, iS12{128}]
   = broadcast( T0_g_float[iS0{1024}, iS1{128}] )
(2)
T7_l_float[bS19{1 ex 32}, iS20{1024}, iS21{128}] = expand( T4_l_float[bS10{1}, iS11{1024}, iS12{128}], {32, 1024, 128} )
(29)
T2_l_float[bS4{1}, iS5{1024}, iS6{128}]
   = broadcast( T1_g_float[iS2{1024}, iS3{128}] )
(0)
T8_l_float[bS22{1 ex 32}, iS23{1024}, iS24{128}]
   = T7_l_float[bS19{1 ex 32}, iS20{1024}, iS21{128}]
   + T2_l_float[bS4{1}, iS5{1024}, iS6{128}];
(27)
T9_g_float[iS30{32768}rf, iS27{128}] = view( T8_l_float[bS22{1 ex 32}, iS23{1024}, iS24{128}] )
(8)
}

} //Segmented_Fusion


===== Transpose Stats ========
inputs: T0_g_float[iS0{1024}, iS1{128}], T1_g_float[iS2{1024}, iS3{128}]
outputs: T9_g_float[iS30{32768}rf, iS27{128}]
shape: 32768 128
num_elems: 4194304
n_io_tensors: 3
max_io_dtype_size: 4
group 1: T9_g_float[iS30{32768}rf, iS27{128}], T0_g_float[iS0{1024}, iS1{128}]
reference1: T9_g_float[iS30{32768}rf, iS27{128}]
inner_most_id1 position: 1 (in reference 1)
group 2: T1_g_float[iS2{1024}, iS3{128}]
reference2: T1_g_float[iS2{1024}, iS3{128}]
inner_most_id2 position: 0 (in reference 1)

Here reference2, T1_g_float[iS2{1024}, iS3{128}], doesn't cover all the source IDs of output T9, but the transpose scheduler only propagates/parallelizes reference2 within its own group, so it's still a valid reference.

If we don't skip the check here, we'll end up not using the pointwise scheduler and lose part of the vectorization.

Collaborator:
I see. It seems this would miss a case where a reference tensor indeed has an output like T9 in the same group. If so, we should not just always skip the check even for the transpose scheduler, should we?

Author (jjsjann123):
Indeed. It's not safe to blindly skip it. We want the check to be consistent with how the transformation is propagated.

I have a comment in the transpose scheduler where we pass false to skip the check, noting that it's not correct.
The transpose scheduler is a bit strange, since it also does global scheduling at Step 2, but only for a subset of IDs on reference1. So the right check to apply is a bit blurry.

I feel I need to look at it further to figure out the right check for the transpose scheduler, and I'm not ready to commit to that in this PR.

Collaborator:
I see. I thought you were saying this would be less than ideal but still acceptable, but it looks like this is a real bug. I agree that fixing the issue doesn't need to be done in this PR, but I think we should take the safer option, i.e., not skip the check even with the transpose scheduler. Failing to use a vectorized transpose scheduler is a bummer, but it is better than failing to run.

Author (jjsjann123):
Sounds fair, I'll go with that route and temporarily disable some CI tests. I'll make sure I open an issue to track the disabled tests and restore them when we fix the validation.

Author (jjsjann123):
Opened issue #3570. I'll have it in the code comment as well.

// coverage check of the reference on outputs of the fusion. Note that
// this is not ideal; we would instead want the reference tensor
// checked against all its target IO tensors.
// TODO: open an issue for this one. The transpose scheduler is not supposed
// to reuse pointwise_utils::DomainMap::isValidReference. This function is
// too restrictive and doesn't align well with the scheme of the transpose
// scheduler.
if (isValidReference(tv)) {
int64_t dims = (int64_t)pointwise_utils::nRootDims(tv);
int64_t dims = (int64_t)pointwise_utils::nLogicalDims(tv);
if (dims > max_dims) {
result = tv;
max_dims = dims;
@@ -990,12 +998,12 @@ std::unique_ptr<TransposeParams> getTransposeHeuristics(
<< "max_io_dtype_size: " << max_io_dtype_size << "\n"
<< "group 1: " << ir_utils::toString(grouped_inputs_outputs[0])
<< "\n"
<< "reference1: " << reference1 << "\n"
<< "reference1: " << reference1->toString() << "\n"
<< "inner_most_id1 position: " << inner_most_pos1_in_ref1
<< " (in reference 1)\n"
<< "group 2: " << ir_utils::toString(grouped_inputs_outputs[1])
<< "\n"
<< "reference2: " << reference2 << "\n"
<< "reference2: " << reference2->toString() << "\n"
<< "inner_most_id2 position: " << inner_most_pos2_in_ref1
<< " (in reference 1)" << std::endl;
if (hasSmallTransposeDimensions(tparams)) {
@@ -1045,11 +1053,11 @@ void scheduleTranspose(Fusion* fusion, const TransposeParams* tparams) {

int64_t max_dims = 0;
for (auto inp : input_tvs) {
max_dims = std::max(pointwise_utils::nRootDims(inp), max_dims);
max_dims = std::max(pointwise_utils::nLogicalDims(inp), max_dims);
}

for (auto out : output_tvs) {
max_dims = std::max(pointwise_utils::nRootDims(out), max_dims);
max_dims = std::max(pointwise_utils::nLogicalDims(out), max_dims);
}

// If everything is zero dim tensors, just return.