Add slice tests to demonstrate manual scheduling #2898

Merged: 21 commits, Sep 5, 2024. Changes shown from 18 commits.
137 changes: 112 additions & 25 deletions csrc/device_lower/analysis/sync_information.cpp
@@ -7,8 +7,11 @@
// clang-format on
#include <device_lower/analysis/index_compute.h>
#include <device_lower/lower2device.h>
#include <id_model/indexing.h>
#include <id_model/indexing_utils.h>
#include <instrumentation.h>
#include <ir/utils.h>
#include <transform_iter.h>

#include <device_lower/analysis/sync_information.h>

@@ -492,7 +495,7 @@ SyncMap::SyncMap(Fusion* fusion) {
producer_redundant_types & (~producer_redundant_use_types);

for (const auto producer_i : c10::irange(producer->nDims())) {
auto producer_axis = producer->axis(producer_i);
auto producer_axis = producer->getLoopDomain().at(producer_i);
auto producer_ptype =
ca_map->getConcreteMappedID(producer_axis, IdMappingMode::LOOP)
->getParallelType();
@@ -516,7 +519,7 @@ SyncMap::SyncMap(Fusion* fusion) {
std::vector<IterDomain*> consumer_parallel_ids(
ParallelTypeBitmap::kNumParallelTypes, nullptr);
for (const auto consumer_i : c10::irange(consumer->nDims())) {
auto consumer_axis = consumer->axis(consumer_i);
auto consumer_axis = consumer->getLoopDomain().at(consumer_i);
auto consumer_ptype =
ca_map->getConcreteMappedID(consumer_axis, IdMappingMode::LOOP)
->getParallelType();
@@ -541,6 +544,23 @@ SyncMap::SyncMap(Fusion* fusion) {

ProducerConsumerIndexingInfoCache indexing_info(producer, consumer);

// P2C map is required when using the IdModel-based analysis
const std::unordered_map<IterDomain*, IterDomain*>
p2c_map_no_forwarding = GpuLower::current()->hasIdModel()
? BestEffortReplay(
consumer->getLoopDomain(),
producer->getLoopDomain(),
PairwiseLogicalDomainMap(producer, consumer)
.mapProducerToConsumer(),
/*replay_forward_id_map=*/{},
/*target_forward_id_map=*/{},
/*skip_replay_swizzle=*/false,
/*skip_target_swizzle=*/false,
/*skip_resize=*/false,
/*error_on_failure=*/false)
.getReplay()
: std::unordered_map<IterDomain*, IterDomain*>{};

// At this point each parallel type that's present in the consumer or
// the producer will be present in their corresponding `_parallel_ids`
// map going from parallel index type (only size 6 for grid/block dims)
@@ -653,6 +673,7 @@ SyncMap::SyncMap(Fusion* fusion) {
producer->getLogicalDomain(), {p_id})
.empty()) {
raw_dims.set(producer_ptype);
continue;
}
}

@@ -662,30 +683,96 @@ SyncMap::SyncMap(Fusion* fusion) {
continue;
}

// When the producer is parallelized, the producer and the
// consumer must use the same index with the same parallel
// type. Otherwise, a sync is required. This is not the case
// when this op is a parallel broadcast.

if (producer_parallel_bcast) {
// As long as they are permissively mapped using the same
// parallel type, no communication is required
if (producer_ptype == consumer_ptype &&
ca_map->areMapped(p_id, c_id, IdMappingMode::PERMISSIVE)) {
continue;
// Use the IdModel loop promotion when available. This is
// required for tensors with non-trivial loop domains
if (GpuLower::current()->hasIdModel()) {
if (producer_ptype == consumer_ptype) {
// If both domains are promoted to the same domain
// (i.e., mapped in the AlmostExact graph), there should
// be no cross-thread dependency
const auto& id_model = GpuLower::current()->idModel();
auto producer_loop_id =
indexing_utils::getLoopPromotion(p_id, id_model);
auto consumer_loop_id =
indexing_utils::getLoopPromotion(c_id, id_model);
const auto& indexing_traversal_graph =
id_model.idGraph(TensorIndexer::traversalGraphType());
if (indexing_traversal_graph.disjointValSets().strictAreMapped(
producer_loop_id, consumer_loop_id)) {
continue;
} else {
// If the producer ID is a broadcast, it does not
// require synchronization even when the producer and
// consumer domains are not promoted to the same
// group. For example,
//
// tv0: [i0]
// tv1: [b1]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// Assume that there's no inlining. Since it isn't
// inlined, the loop domain of tv2 is not mapped with
// that of tv3, so the above condition won't be
// hit. Still, since tv2 will be executed by all TIDx
// threads independently, there's no need for
// synchronization.
//
// Consider a similar case like below:
//
// tv0: [i0, i1]
// tv1: [i2, b3]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->merge(0, 1);
// tv3->merge(0, 1);
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// This case does require a synchronization since for
// tv2, TIDx will be used to parallelize the outer
// domain only, whereas for tv3 it is mapped to the
// merged domain of the outer and inner domains. In
// other words, if a broadcast becomes non-broadcast
// by getting merged with a non-broadcast domain, it
// requires a synchronization.
if (p_id->isBroadcast()) {
if (auto it = p2c_map_no_forwarding.find(p_id);
it != p2c_map_no_forwarding.end() && it->second == c_id) {
continue;
}
}
}
}
} else {
// When the producer is parallelized, the producer and the
// consumer must use the same index with the same parallel
// type. Otherwise, a sync is required. This is not the case
// when this op is a parallel broadcast.
if (producer_parallel_bcast) {
// As long as they are permissively mapped using the same
// parallel type, no communication is required
if (producer_ptype == consumer_ptype &&
ca_map->areMapped(p_id, c_id, IdMappingMode::PERMISSIVE)) {
continue;
}
// Can this happen?
NVF_ERROR(
false,
"Unexpected case. Producer: ",
producer->toString(),
", consumer: ",
consumer->toString());
}
if (producer_ptype == consumer_ptype) {
if (useSameIndex(producer, p_id, consumer, c_id, indexing_info)) {
continue;
}
}
// Can this happen?
NVF_ERROR(
false,
"Unexpected case. Producer: ",
producer->toString(),
", consumer: ",
consumer->toString());
}

if (producer_ptype == consumer_ptype &&
useSameIndex(producer, p_id, consumer, c_id, indexing_info)) {
continue;
}

raw_dims.set(producer_ptype);
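Condensed, the new IdModel-based branch above reduces to a single predicate over a parallelized producer/consumer domain pair. The following sketch simply restates the calls shown in the hunk; the free-standing helper and its name are illustrative and not part of the change.

// Illustrative helper (not in the PR): returns true when the pair
// (p_id, c_id) needs no RAW synchronization under the IdModel-based
// analysis shown in the hunk above.
bool noRawSyncNeeded(
    IterDomain* p_id,
    IterDomain* c_id,
    const std::unordered_map<IterDomain*, IterDomain*>& p2c_map_no_forwarding) {
  const auto& id_model = GpuLower::current()->idModel();
  // Compare the loop-promoted domains in the traversal (AlmostExact) graph
  auto producer_loop_id = indexing_utils::getLoopPromotion(p_id, id_model);
  auto consumer_loop_id = indexing_utils::getLoopPromotion(c_id, id_model);
  const auto& graph = id_model.idGraph(TensorIndexer::traversalGraphType());
  if (graph.disjointValSets().strictAreMapped(
          producer_loop_id, consumer_loop_id)) {
    // Same promoted group: producer and consumer use the same index
    return true;
  }
  // A broadcast producer domain that maps directly (without forwarding) to
  // the consumer domain is computed redundantly by every thread, so no
  // synchronization is needed either
  if (p_id->isBroadcast()) {
    if (auto it = p2c_map_no_forwarding.find(p_id);
        it != p2c_map_no_forwarding.end() && it->second == c_id) {
      return true;
    }
  }
  // Otherwise a RAW sync (and shared or global memory for the producer)
  // is required
  return false;
}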
6 changes: 2 additions & 4 deletions csrc/device_lower/lower2device.cpp
@@ -415,7 +415,8 @@ void GpuLower::analysis(Fusion* fusion) {
// functionality should be affected. New IterDomains may be created,
// so it is expected that generated code may use different variable
// names
if (this->requiresIdModel() || isOptionEnabled(EnableOption::IdModel)) {
if (true || this->requiresIdModel() ||
isOptionEnabled(EnableOption::IdModel)) {
// Enable validation in the DEBUG build mode
#ifdef NDEBUG
// Not DEBUG build
@@ -468,9 +469,6 @@ void GpuLower::analysis(Fusion* fusion) {
validateSwizzle(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "validateSwizzle");

validateResize(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "validateResize");

validateReductions(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "validateReductions");

4 changes: 4 additions & 0 deletions csrc/device_lower/lower2device.h
@@ -112,6 +112,10 @@ class GpuLower : public NonCopyable {
return std::const_pointer_cast<const ComputeAtMap>(compute_at_map_);
}

bool hasIdModel() const {
return id_model_.get() != nullptr;
}

IdModel& idModel() {
NVF_ERROR(id_model_.get());
return *id_model_;
19 changes: 0 additions & 19 deletions csrc/device_lower/validation.cpp
@@ -1058,25 +1058,6 @@ void validateLookupTV(Fusion* fusion) {
}
}

void validateResize(Fusion* fusion) {
auto fusion_vals = fusion->usedMathVals();
for (auto tv : ir_utils::filterByType<TensorView>(fusion_vals)) {
// Make sure resize is only used as part of root to logical transformations
Review comment (Collaborator Author): No longer holds

auto rf_to_loop_exprs = StmtSort::getExprsBetween(
{tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()},
{tv->getLoopDomain().begin(), tv->getLoopDomain().end()});

NVF_ERROR(
std::none_of(
rf_to_loop_exprs.begin(),
rf_to_loop_exprs.end(),
[](Expr* expr) { return expr->isA<Resize>(); }),
"Invalid use of resize detected with ",
tv->toString(),
". Resize may only be used as part of root to logical transformations.");
}
}

void validateReductions(Fusion* fusion) {
for (auto rop : ir_utils::getOpsOfType<ReductionOp>(fusion)) {
auto in = rop->in()->as<TensorView>();
3 changes: 0 additions & 3 deletions csrc/device_lower/validation.h
@@ -68,9 +68,6 @@ void validateGroupedReductions(Fusion* fusion);
//! Validate all of the lookup TVs are ensured to be fusion inputs
void validateLookupTV(Fusion* fusion);

//! Validate resize usage
void validateResize(Fusion* fusion);

//! Check that there are no reductions over unexpanded broadcasts
void validateReductions(Fusion* fusion);

7 changes: 6 additions & 1 deletion csrc/id_model/indexing.h
@@ -92,8 +92,13 @@ class TensorIndexer {
// should not affect actual index exprs.
// Returns non-const reference because indexing may create new domains and
// need to update the graph.

static IdMappingMode traversalGraphType() {
return IdMappingMode::ALMOSTEXACT;
}

ValGraph& traversalGraph() const {
return id_model_.idGraph(IdMappingMode::ALMOSTEXACT);
return id_model_.idGraph(traversalGraphType());
}

// Traverse exprs and set allocation info for each tensor
27 changes: 27 additions & 0 deletions tests/cpp/test_gpu3.cpp
@@ -8863,6 +8863,33 @@ TEST_F(NVFuserTest, BestEffortReplayWithMismatchedRootToLogical) {
/*error_on_failure=*/false);
}

TEST_F(NVFuserTest, RAWSync) {
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
auto tv1 = makeSymbolicTensor(1);
fusion.addInput(tv1);

auto tv2 = broadcast(tv1, {false, true});
auto tv3 = add(tv0, tv2);
fusion.addOutput(tv3);

tv3->merge(0);
tv2->merge(0);
tv3->axis(0)->parallelize(ParallelType::TIDx);
tv2->axis(0)->parallelize(ParallelType::TIDx);

// Since tv2 is not inlined and tv2 and tv3 are both parallelized,
// tv2, as a producer of tv3, requires a synchronization and must be
// placed in shared memory. Since it is not, lowering the fusion
// should fail.
EXPECT_THAT(
[&]() { GpuLower(&fusion).run(); },
testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr(
"Producer is required to be in Global or Shared Memory based on parallelization strategy. RAW flags: (threadIdx.x)")));
}
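
For contrast, below is a minimal sketch of the no-sync case described in the new comment in sync_information.cpp: the same fusion as RAWSync but without the merges, so the parallelized producer domain of tv2 stays a broadcast. The test name is hypothetical, this sketch is not part of the PR, and the expectation that lowering succeeds follows from the comment rather than from a verified run.

TEST_F(NVFuserTest, RAWSyncNotNeededForBroadcast) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(1);
  fusion.addInput(tv1);

  auto tv2 = broadcast(tv1, {false, true});
  auto tv3 = add(tv0, tv2);
  fusion.addOutput(tv3);

  // Unlike RAWSync above, the broadcast axis is not merged away, so the
  // parallelized producer domain of tv2 remains a broadcast. Every TIDx
  // thread computes tv2 redundantly, so no synchronization should be
  // required and lowering is expected to succeed.
  tv3->axis(1)->parallelize(ParallelType::TIDx);
  tv2->axis(1)->parallelize(ParallelType::TIDx);

  GpuLower(&fusion).run();
}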

// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser
4 changes: 1 addition & 3 deletions tests/cpp/test_gpu_view.cpp
@@ -2701,11 +2701,9 @@ TEST_F(GpuViewTest, FusionMismatchingReshape) {
// Parallelize all tensors as [BIDx, TIDx]
schedule.merge(0);
schedule.split(0, 128);
#if 0
// TODO: sync analysis is not working yet

schedule.parallelize(0, ParallelType::BIDx);
schedule.parallelize(1, ParallelType::TIDx);
#endif

// Now, tv5 looks like:
//