Skip to content

Commit

Permalink
Merge branch 'main' into slice_manual_scheduling_tests
Browse files Browse the repository at this point in the history
  • Loading branch information
naoyam committed Sep 5, 2024
2 parents e475b15 + 1158543 commit d7ab0bf
Showing 1 changed file with 67 additions and 48 deletions.
115 changes: 67 additions & 48 deletions csrc/device_lower/analysis/sync_information.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -687,9 +687,27 @@ SyncMap::SyncMap(Fusion* fusion) {
// required for tensors with non-trivial loop domains
if (GpuLower::current()->hasIdModel()) {
if (producer_ptype == consumer_ptype) {
// If both domains are promoted to the same domain
// (i.e., mapped in the AlmostExact graph), they should
// be no cross-thread dependency
// Case 1:
// Producer loop ID: non-broadcast
// Consumer loop ID: non-broadcast
// -> No sync if they are exactly mapped. This case is covered by
// the promotion check.
//
// Case 2:
// Producer loop ID: broadcast (which may be produced by
// merging multiple broadcast domains)
// Consumer loop ID: non-broadcast
// -> They are not exactly mapped but sync is not necessary as
// discussed below.
//
// Case 3:
// Producer loop ID: non-broadcast
// Consumer loop ID: non-broadcast
// -> Sync required if they are not exactly mapped, even when they
// are mapped by the best effort replay. (See
// NVFuserTest.RAWSync for a concrete repro).

// Case 1
const auto& id_model = GpuLower::current()->idModel();
auto producer_loop_id =
indexing_utils::getLoopPromotion(p_id, id_model);
Expand All @@ -700,51 +718,52 @@ SyncMap::SyncMap(Fusion* fusion) {
if (indexing_traveral_graph.disjointValSets().strictAreMapped(
producer_loop_id, consumer_loop_id)) {
continue;
} else {
// If the producer ID is a broadcast, it does not
// require synchronization even when the producer and
// consumer domains are not promoted to the same
// group. For example,
//
// tv0: [i0]
// tv1: [b1]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// Assume that there's no inlining. Since it isn't
// inlined, the loop domain of tv2 is not mapped with
// that of tv3, thus the avove condition won't
// hit. Still, since tv2 will be executed by all TIDx
// threads independently, there's no need of
// synchronization.
//
// Consider a similar case like below:
//
// tv0: [i0, i1]
// tv1: [i2, b3]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->merge(0, 1);
// tv3->merge(0, 1);
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// This case does require a synchronization since for
// tv2, TIDx will be used to parallelize the outer
// domain only, whereas for tv3 it is mapped to the
// merged domain of the outer and inner domains. In
// other words, if a broadcast becomes non-broadcast
// by getting merged with a non-broadcast domain, it
// requires a synchronization.
if (p_id->isBroadcast()) {
if (auto it = p2c_map_no_forwarding.find(p_id);
it != p2c_map_no_forwarding.end() && it->second == c_id) {
continue;
}
}

// Case 2
// If the producer ID is a broadcast, it does not
// require synchronization even when the producer and
// consumer domains are not promoted to the same
// group. For example,
//
// tv0: [i0]
// tv1: [b1]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// Assume that there's no inlining. Since it isn't
// inlined, the loop domain of tv2 is not mapped with
// that of tv3, thus the avove condition won't
// hit. Still, since tv2 will be executed by all TIDx
// threads independently, there's no need of
// synchronization.
//
// Consider a similar case like below:
//
// tv0: [i0, i1]
// tv1: [i2, b3]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->merge(0, 1);
// tv3->merge(0, 1);
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// This case does require a synchronization since for
// tv2, TIDx will be used to parallelize the outer
// domain only, whereas for tv3 it is mapped to the
// merged domain of the outer and inner domains. In
// other words, if a broadcast becomes non-broadcast
// by getting merged with a non-broadcast domain, it
// requires a synchronization.
if (p_id->isBroadcast()) {
if (auto it = p2c_map_no_forwarding.find(p_id);
it != p2c_map_no_forwarding.end() && it->second == c_id) {
continue;
}
}
}
Expand Down

0 comments on commit d7ab0bf

Please sign in to comment.