Use loop promotion and indexing traversal graph to find mismatched parallelization (#2875)

See #2850
Stacked on #2901 

The old code is still used by default. With `NVFUSER_ENABLE=id_model`,
the new analysis is used. It's also used for tensors with
non-conventional domains.

This is required for #2851. It also re-enables the previously disabled
parallelization in the mismatching reshape test from #2684.

I validated the change by comparing the results of the existing and new
analyses across all tests and benchmarks. The only mismatch was the
mismatching reshape test, for which the existing analysis declared that a
sync is required, whereas the new one correctly recognizes that there is
no cross-thread dependency.
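
For reference, the heart of the new analysis is a small check: look up the loop promotion of the producer and consumer loop IDs and ask whether they are strictly mapped in the indexer's traversal graph. The sketch below is assembled from the sync_information.cpp diff in this commit and is not a verbatim excerpt; `mappedByLoopPromotion` is a hypothetical helper name, and `p_id`/`c_id` stand for the producer/consumer loop IterDomains compared during lowering (so it only makes sense where `GpuLower::current()` is set and the IdModel has been built).

```cpp
// Minimal sketch, assuming it runs during lowering with the IdModel built
// (e.g. under NVFUSER_ENABLE=id_model). The helper name is hypothetical;
// the calls mirror the sync_information.cpp diff below.
#include <device_lower/lower2device.h>
#include <id_model/indexing.h>
#include <id_model/indexing_utils.h>

namespace nvfuser {

bool mappedByLoopPromotion(IterDomain* p_id, IterDomain* c_id) {
  const auto& id_model = GpuLower::current()->idModel();
  // Promoted loop IDs represent what each loop actually iterates over.
  auto producer_loop_id = indexing_utils::getLoopPromotion(p_id, id_model);
  auto consumer_loop_id = indexing_utils::getLoopPromotion(c_id, id_model);
  // Query the same (almost-exact) graph the tensor indexer traverses.
  const auto& graph = id_model.idGraph(TensorIndexer::traversalGraphType());
  // Strictly mapped promoted IDs use the same index, so no RAW sync is
  // needed for their parallel type (broadcast special cases aside).
  return graph.disjointValSets().strictAreMapped(
      producer_loop_id, consumer_loop_id);
}

} // namespace nvfuser
```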
naoyam authored Sep 5, 2024
1 parent d5b2a24 commit 1158543
Showing 5 changed files with 169 additions and 29 deletions.
156 changes: 131 additions & 25 deletions csrc/device_lower/analysis/sync_information.cpp
@@ -7,8 +7,11 @@
// clang-format on
#include <device_lower/analysis/index_compute.h>
#include <device_lower/lower2device.h>
#include <id_model/indexing.h>
#include <id_model/indexing_utils.h>
#include <instrumentation.h>
#include <ir/utils.h>
#include <transform_iter.h>

#include <device_lower/analysis/sync_information.h>

@@ -492,7 +495,7 @@ SyncMap::SyncMap(Fusion* fusion) {
producer_redundant_types & (~producer_redundant_use_types);

for (const auto producer_i : c10::irange(producer->nDims())) {
auto producer_axis = producer->axis(producer_i);
auto producer_axis = producer->getLoopDomain().at(producer_i);
auto producer_ptype =
ca_map->getConcreteMappedID(producer_axis, IdMappingMode::LOOP)
->getParallelType();
@@ -516,7 +519,7 @@ SyncMap::SyncMap(Fusion* fusion) {
std::vector<IterDomain*> consumer_parallel_ids(
ParallelTypeBitmap::kNumParallelTypes, nullptr);
for (const auto consumer_i : c10::irange(consumer->nDims())) {
auto consumer_axis = consumer->axis(consumer_i);
auto consumer_axis = consumer->getLoopDomain().at(consumer_i);
auto consumer_ptype =
ca_map->getConcreteMappedID(consumer_axis, IdMappingMode::LOOP)
->getParallelType();
@@ -541,6 +544,23 @@ SyncMap::SyncMap(Fusion* fusion) {

ProducerConsumerIndexingInfoCache indexing_info(producer, consumer);

// P2C map is required when using the IdModel-based analysis
const std::unordered_map<IterDomain*, IterDomain*>
p2c_map_no_forwarding = GpuLower::current()->hasIdModel()
? BestEffortReplay(
consumer->getLoopDomain(),
producer->getLoopDomain(),
PairwiseLogicalDomainMap(producer, consumer)
.mapProducerToConsumer(),
/*replay_forward_id_map=*/{},
/*target_forward_id_map=*/{},
/*skip_replay_swizzle=*/false,
/*skip_target_swizzle=*/false,
/*skip_resize=*/false,
/*error_on_failure=*/false)
.getReplay()
: std::unordered_map<IterDomain*, IterDomain*>{};

// At this point each parallel type that's present in the consumer or
// the producer will be present in their corresponding `_parallel_ids`
// map going from parallel index type (only size 6 for grid/block dims)
@@ -653,6 +673,7 @@ SyncMap::SyncMap(Fusion* fusion) {
producer->getLogicalDomain(), {p_id})
.empty()) {
raw_dims.set(producer_ptype);
continue;
}
}

@@ -662,30 +683,115 @@ SyncMap::SyncMap(Fusion* fusion) {
continue;
}

// When the producer is parallelized, the producer and the
// consumer must use the same index with the same parallel
// type. Otherwise, a sync is required. This is not the case
// when this op is a parallel broadcast.

if (producer_parallel_bcast) {
// As long as they are permissively mapped using the same
// parallel type, no communication is required
if (producer_ptype == consumer_ptype &&
ca_map->areMapped(p_id, c_id, IdMappingMode::PERMISSIVE)) {
continue;
// Use the IdModel loop promotion when available. This is
// required for tensors with non-trivial loop domains
if (GpuLower::current()->hasIdModel()) {
if (producer_ptype == consumer_ptype) {
// Case 1:
// Producer loop ID: non-broadcast
// Consumer loop ID: non-broadcast
// -> No sync if they are exactly mapped. This case is covered by
// the promotion check.
//
// Case 2:
// Producer loop ID: broadcast (which may be produced by
// merging multiple broadcast domains)
// Consumer loop ID: non-broadcast
// -> They are not exactly mapped but sync is not necessary as
// discussed below.
//
// Case 3:
// Producer loop ID: non-broadcast
// Consumer loop ID: non-broadcast
// -> Sync required if they are not exactly mapped, even when they
// are mapped by the best effort replay. (See
// NVFuserTest.RAWSync for a concrete repro).

// Case 1
const auto& id_model = GpuLower::current()->idModel();
auto producer_loop_id =
indexing_utils::getLoopPromotion(p_id, id_model);
auto consumer_loop_id =
indexing_utils::getLoopPromotion(c_id, id_model);
const auto& indexing_traveral_graph =
id_model.idGraph(TensorIndexer::traversalGraphType());
if (indexing_traveral_graph.disjointValSets().strictAreMapped(
producer_loop_id, consumer_loop_id)) {
continue;
}

// Case 2
// If the producer ID is a broadcast, it does not
// require synchronization even when the producer and
// consumer domains are not promoted to the same
// group. For example,
//
// tv0: [i0]
// tv1: [b1]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// Assume that there's no inlining. Since it isn't
// inlined, the loop domain of tv2 is not mapped with
// that of tv3, thus the above condition won't
// hit. Still, since tv2 will be executed by all TIDx
// threads independently, there's no need of
// synchronization.
//
// Consider a similar case like below:
//
// tv0: [i0, i1]
// tv1: [i2, b3]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->merge(0, 1);
// tv3->merge(0, 1);
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// This case does require a synchronization since for
// tv2, TIDx will be used to parallelize the outer
// domain only, whereas for tv3 it is mapped to the
// merged domain of the outer and inner domains. In
// other words, if a broadcast becomes non-broadcast
// by getting merged with a non-broadcast domain, it
// requires a synchronization.
if (p_id->isBroadcast()) {
if (auto it = p2c_map_no_forwarding.find(p_id);
it != p2c_map_no_forwarding.end() && it->second == c_id) {
continue;
}
}
}
} else {
// When the producer is parallelized, the producer and the
// consumer must use the same index with the same parallel
// type. Otherwise, a sync is required. This is not the case
// when this op is a parallel broadcast.
if (producer_parallel_bcast) {
// As long as they are permissively mapped using the same
// parallel type, no communication is required
if (producer_ptype == consumer_ptype &&
ca_map->areMapped(p_id, c_id, IdMappingMode::PERMISSIVE)) {
continue;
}
// Can this happen?
NVF_ERROR(
false,
"Unexpected case. Producer: ",
producer->toString(),
", consumer: ",
consumer->toString());
}
if (producer_ptype == consumer_ptype) {
if (useSameIndex(producer, p_id, consumer, c_id, indexing_info)) {
continue;
}
}
// Can this happen?
NVF_ERROR(
false,
"Unexpected case. Producer: ",
producer->toString(),
", consumer: ",
consumer->toString());
}

if (producer_ptype == consumer_ptype &&
useSameIndex(producer, p_id, consumer, c_id, indexing_info)) {
continue;
}

raw_dims.set(producer_ptype);
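
To make the "Case 2" comment above concrete, here is an illustrative, test-style sketch of the no-sync situation it describes: the producer's parallelized loop ID is a plain, unmerged broadcast. It is adapted from the RAWSync test added below and is not part of this commit; the test name is hypothetical, and the stated expectation applies to the IdModel-based analysis.

```cpp
// Illustrative sketch only (not from this commit). Unlike the RAWSync test
// below, the broadcast axis of tv2 is parallelized without being merged, so
// every TIDx thread computes tv2 redundantly and no RAW sync should be
// required under the IdModel-based analysis.
TEST_F(NVFuserTest, HypotheticalCase2NoSyncSketch) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2); // [i0, i1]
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(1); // [i2]
  fusion.addInput(tv1);

  auto tv2 = broadcast(tv1, {false, true}); // [i2, b3]
  auto tv3 = add(tv0, tv2);
  fusion.addOutput(tv3);

  // No merge here: the broadcast stays a broadcast and is parallelized
  // directly on both tensors.
  tv2->axis(1)->parallelize(ParallelType::TIDx);
  tv3->axis(1)->parallelize(ParallelType::TIDx);

  // Expected to lower without flagging a RAW sync for TIDx when the
  // IdModel-based analysis is enabled.
  GpuLower(&fusion).run();
}
```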
4 changes: 4 additions & 0 deletions csrc/device_lower/lower2device.h
@@ -112,6 +112,10 @@ class GpuLower : public NonCopyable {
return std::const_pointer_cast<const ComputeAtMap>(compute_at_map_);
}

bool hasIdModel() const {
return id_model_.get() != nullptr;
}

IdModel& idModel() {
NVF_ERROR(id_model_.get());
return *id_model_;
7 changes: 6 additions & 1 deletion csrc/id_model/indexing.h
@@ -92,8 +92,13 @@ class TensorIndexer {
// should not affect actual index exprs.
// Returns non-const reference because indexing may create new domains and
// need to update the graph.

static IdMappingMode traversalGraphType() {
return IdMappingMode::ALMOSTEXACT;
}

ValGraph& traversalGraph() const {
return id_model_.idGraph(IdMappingMode::ALMOSTEXACT);
return id_model_.idGraph(traversalGraphType());
}

// Traverse exprs and set allocation info for each tensor
27 changes: 27 additions & 0 deletions tests/cpp/test_gpu3.cpp
@@ -8863,6 +8863,33 @@ TEST_F(NVFuserTest, BestEffortReplayWithMismatchedRootToLogical) {
/*error_on_failure=*/false);
}

TEST_F(NVFuserTest, RAWSync) {
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
auto tv1 = makeSymbolicTensor(1);
fusion.addInput(tv1);

auto tv2 = broadcast(tv1, {false, true});
auto tv3 = add(tv0, tv2);
fusion.addOutput(tv3);

tv3->merge(0);
tv2->merge(0);
tv3->axis(0)->parallelize(ParallelType::TIDx);
tv2->axis(0)->parallelize(ParallelType::TIDx);

// Since tv2 is not inlined and both tv2 and tv3 are parallelized,
// tv2, as a producer of tv3, requires a synchronization and thus must
// be placed in shared memory. Lowering the fusion should fail.
EXPECT_THAT(
[&]() { GpuLower(&fusion).run(); },
testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr(
"Producer is required to be in Global or Shared Memory based on parallelization strategy. RAW flags: (threadIdx.x)")));
}

// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser
4 changes: 1 addition & 3 deletions tests/cpp/test_gpu_view.cpp
@@ -2701,11 +2701,9 @@ TEST_F(GpuViewTest, FusionMismatchingReshape) {
// Parallelize all tensors as [BIDx, TIDx]
schedule.merge(0);
schedule.split(0, 128);
#if 0
// TODO: sync analysis is not working yet

schedule.parallelize(0, ParallelType::BIDx);
schedule.parallelize(1, ParallelType::TIDx);
#endif

// Now, tv5 looks like:
//
