Add slice tests to demonstrate manual scheduling #2898

Merged: 21 commits, Sep 5, 2024. Changes shown from 18 commits.
137 changes: 112 additions & 25 deletions csrc/device_lower/analysis/sync_information.cpp
@@ -7,8 +7,11 @@
// clang-format on
#include <device_lower/analysis/index_compute.h>
#include <device_lower/lower2device.h>
#include <id_model/indexing.h>
#include <id_model/indexing_utils.h>
#include <instrumentation.h>
#include <ir/utils.h>
#include <transform_iter.h>

#include <device_lower/analysis/sync_information.h>

@@ -492,7 +495,7 @@ SyncMap::SyncMap(Fusion* fusion) {
producer_redundant_types & (~producer_redundant_use_types);

for (const auto producer_i : c10::irange(producer->nDims())) {
auto producer_axis = producer->axis(producer_i);
auto producer_axis = producer->getLoopDomain().at(producer_i);
auto producer_ptype =
ca_map->getConcreteMappedID(producer_axis, IdMappingMode::LOOP)
->getParallelType();
@@ -516,7 +519,7 @@ SyncMap::SyncMap(Fusion* fusion) {
std::vector<IterDomain*> consumer_parallel_ids(
ParallelTypeBitmap::kNumParallelTypes, nullptr);
for (const auto consumer_i : c10::irange(consumer->nDims())) {
auto consumer_axis = consumer->axis(consumer_i);
auto consumer_axis = consumer->getLoopDomain().at(consumer_i);
auto consumer_ptype =
ca_map->getConcreteMappedID(consumer_axis, IdMappingMode::LOOP)
->getParallelType();
@@ -541,6 +544,23 @@ SyncMap::SyncMap(Fusion* fusion) {

ProducerConsumerIndexingInfoCache indexing_info(producer, consumer);

// P2C map is required when using the IdModel-based analysis
const std::unordered_map<IterDomain*, IterDomain*>
p2c_map_no_forwarding = GpuLower::current()->hasIdModel()
? BestEffortReplay(
consumer->getLoopDomain(),
producer->getLoopDomain(),
PairwiseLogicalDomainMap(producer, consumer)
.mapProducerToConsumer(),
/*replay_forward_id_map=*/{},
/*target_forward_id_map=*/{},
/*skip_replay_swizzle=*/false,
/*skip_target_swizzle=*/false,
/*skip_resize=*/false,
/*error_on_failure=*/false)
.getReplay()
: std::unordered_map<IterDomain*, IterDomain*>{};

// At this point each parallel type that's present in the consumer or
// the producer will be present in their corresponding `_parallel_ids`
// map going from parallel index type (only size 6 for grid/block dims)
@@ -653,6 +673,7 @@ SyncMap::SyncMap(Fusion* fusion) {
producer->getLogicalDomain(), {p_id})
.empty()) {
raw_dims.set(producer_ptype);
continue;
}
}

@@ -662,30 +683,96 @@ SyncMap::SyncMap(Fusion* fusion) {
continue;
}

// When the producer is parallelized, the producer and the
// consumer must use the same index with the same parallel
// type. Otherwise, a sync is required. This is not the case
// when this op is a parallel broadcast.

if (producer_parallel_bcast) {
// As long as they are permissively mapped using the same
// parallel type, no communication is required
if (producer_ptype == consumer_ptype &&
ca_map->areMapped(p_id, c_id, IdMappingMode::PERMISSIVE)) {
continue;
// Use the IdModel loop promotion when available. This is
// required for tensors with non-trivial loop domains
if (GpuLower::current()->hasIdModel()) {
if (producer_ptype == consumer_ptype) {
// If both domains are promoted to the same domain
// (i.e., mapped in the AlmostExact graph), there should
// be no cross-thread dependency
const auto& id_model = GpuLower::current()->idModel();
auto producer_loop_id =
indexing_utils::getLoopPromotion(p_id, id_model);
auto consumer_loop_id =
indexing_utils::getLoopPromotion(c_id, id_model);
const auto& indexing_traversal_graph =
id_model.idGraph(TensorIndexer::traversalGraphType());
if (indexing_traversal_graph.disjointValSets().strictAreMapped(
producer_loop_id, consumer_loop_id)) {
continue;
} else {
// If the producer ID is a broadcast, it does not
// require synchronization even when the producer and
// consumer domains are not promoted to the same
// group. For example,
//
// tv0: [i0]
// tv1: [b1]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// Assume that there's no inlining. Since it isn't
// inlined, the loop domain of tv2 is not mapped with
// that of tv3, so the above condition won't be
// hit. Still, since tv2 will be executed by all TIDx
// threads independently, there's no need for
// synchronization.
//
// Consider a similar case like below:
//
// tv0: [i0, i1]
// tv1: [i2, b3]
// tv2 = tv1
// tv3 = tv0 + tv2
//
// tv2->merge(0, 1);
// tv3->merge(0, 1);
// tv2->axis(0)->parallelize(ParallelType::TIDx);
// tv3->axis(0)->parallelize(ParallelType::TIDx);
//
// This case does require a synchronization since for
// tv2, TIDx will be used to parallelize the outer
// domain only, whereas for tv3 it is mapped to the
// merged domain of the outer and inner domains. In
// other words, if a broadcast becomes non-broadcast
// by getting merged with a non-broadcast domain, it
// requires a synchronization.
if (p_id->isBroadcast()) {
if (auto it = p2c_map_no_forwarding.find(p_id);
it != p2c_map_no_forwarding.end() && it->second == c_id) {
continue;
}
}
}
}
} else {
// When the producer is parallelized, the producer and the
// consumer must use the same index with the same parallel
// type. Otherwise, a sync is required. This is not the case
// when this op is a parallel broadcast.
if (producer_parallel_bcast) {
// As long as they are permissively mapped using the same
// parallel type, no communication is required
if (producer_ptype == consumer_ptype &&
ca_map->areMapped(p_id, c_id, IdMappingMode::PERMISSIVE)) {
continue;
}
// Can this happen?
NVF_ERROR(
false,
"Unexpected case. Producer: ",
producer->toString(),
", consumer: ",
consumer->toString());
}
if (producer_ptype == consumer_ptype) {
if (useSameIndex(producer, p_id, consumer, c_id, indexing_info)) {
continue;
}
}
// Can this happen?
NVF_ERROR(
false,
"Unexpected case. Producer: ",
producer->toString(),
", consumer: ",
consumer->toString());
}

if (producer_ptype == consumer_ptype &&
useSameIndex(producer, p_id, consumer, c_id, indexing_info)) {
continue;
}

raw_dims.set(producer_ptype);
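Condensed, the new IdModel-based branch above reduces to a single predicate over a parallelized producer/consumer domain pair. The following sketch simply restates the calls shown in the hunk; the free-standing helper and its name are illustrative and not part of the change.

// Illustrative helper (not in the PR): returns true when the pair
// (p_id, c_id) needs no RAW synchronization under the IdModel-based
// analysis shown in the hunk above.
bool noRawSyncNeeded(
    IterDomain* p_id,
    IterDomain* c_id,
    const std::unordered_map<IterDomain*, IterDomain*>& p2c_map_no_forwarding) {
  const auto& id_model = GpuLower::current()->idModel();
  // Compare the loop-promoted domains in the traversal (AlmostExact) graph
  auto producer_loop_id = indexing_utils::getLoopPromotion(p_id, id_model);
  auto consumer_loop_id = indexing_utils::getLoopPromotion(c_id, id_model);
  const auto& graph = id_model.idGraph(TensorIndexer::traversalGraphType());
  if (graph.disjointValSets().strictAreMapped(
          producer_loop_id, consumer_loop_id)) {
    // Same promoted group: producer and consumer use the same index
    return true;
  }
  // A broadcast producer domain that maps directly (without forwarding) to
  // the consumer domain is computed redundantly by every thread, so no
  // synchronization is needed either
  if (p_id->isBroadcast()) {
    if (auto it = p2c_map_no_forwarding.find(p_id);
        it != p2c_map_no_forwarding.end() && it->second == c_id) {
      return true;
    }
  }
  // Otherwise a RAW sync (and shared or global memory for the producer)
  // is required
  return false;
}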
6 changes: 2 additions & 4 deletions csrc/device_lower/lower2device.cpp
@@ -415,7 +415,8 @@ void GpuLower::analysis(Fusion* fusion) {
// functionality should be affected. New IterDomains may be created,
// so it is expected that generated code may use different variable
// names
if (this->requiresIdModel() || isOptionEnabled(EnableOption::IdModel)) {
if (true || this->requiresIdModel() ||
isOptionEnabled(EnableOption::IdModel)) {
// Enable validation in the DEBUG build mode
#ifdef NDEBUG
// Not DEBUG build
@@ -468,9 +469,6 @@ void GpuLower::analysis(Fusion* fusion) {
validateSwizzle(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "validateSwizzle");

validateResize(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "validateResize");

validateReductions(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "validateReductions");

4 changes: 4 additions & 0 deletions csrc/device_lower/lower2device.h
@@ -112,6 +112,10 @@ class GpuLower : public NonCopyable {
return std::const_pointer_cast<const ComputeAtMap>(compute_at_map_);
}

bool hasIdModel() const {
return id_model_.get() != nullptr;
}

IdModel& idModel() {
NVF_ERROR(id_model_.get());
return *id_model_;
19 changes: 0 additions & 19 deletions csrc/device_lower/validation.cpp
@@ -1058,25 +1058,6 @@ void validateLookupTV(Fusion* fusion) {
}
}

void validateResize(Fusion* fusion) {
auto fusion_vals = fusion->usedMathVals();
for (auto tv : ir_utils::filterByType<TensorView>(fusion_vals)) {
// Make sure resize is only used as part of root to logical transformations
Review comment (Collaborator Author): No longer holds

auto rf_to_loop_exprs = StmtSort::getExprsBetween(
{tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()},
{tv->getLoopDomain().begin(), tv->getLoopDomain().end()});

NVF_ERROR(
std::none_of(
rf_to_loop_exprs.begin(),
rf_to_loop_exprs.end(),
[](Expr* expr) { return expr->isA<Resize>(); }),
"Invalid use of resize detected with ",
tv->toString(),
". Resize may only be used as part of root to logical transformations.");
}
}

void validateReductions(Fusion* fusion) {
for (auto rop : ir_utils::getOpsOfType<ReductionOp>(fusion)) {
auto in = rop->in()->as<TensorView>();
3 changes: 0 additions & 3 deletions csrc/device_lower/validation.h
@@ -68,9 +68,6 @@ void validateGroupedReductions(Fusion* fusion);
//! Validate all of the lookup TVs are ensured to be fusion inputs
void validateLookupTV(Fusion* fusion);

//! Validate resize usage
void validateResize(Fusion* fusion);

//! Check that there are no reductions over unexpanded broadcasts
void validateReductions(Fusion* fusion);

7 changes: 6 additions & 1 deletion csrc/id_model/indexing.h
@@ -92,8 +92,13 @@ class TensorIndexer {
// should not affect actual index exprs.
// Returns non-const reference because indexing may create new domains and
// need to update the graph.

static IdMappingMode traversalGraphType() {
return IdMappingMode::ALMOSTEXACT;
}

ValGraph& traversalGraph() const {
return id_model_.idGraph(IdMappingMode::ALMOSTEXACT);
return id_model_.idGraph(traversalGraphType());
}

// Traverse exprs and set allocation info for each tensor
27 changes: 27 additions & 0 deletions tests/cpp/test_gpu3.cpp
@@ -8863,6 +8863,33 @@ TEST_F(NVFuserTest, BestEffortReplayWithMismatchedRootToLogical) {
/*error_on_failure=*/false);
}

TEST_F(NVFuserTest, RAWSync) {
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
auto tv1 = makeSymbolicTensor(1);
fusion.addInput(tv1);

auto tv2 = broadcast(tv1, {false, true});
auto tv3 = add(tv0, tv2);
fusion.addOutput(tv3);

tv3->merge(0);
tv2->merge(0);
tv3->axis(0)->parallelize(ParallelType::TIDx);
tv2->axis(0)->parallelize(ParallelType::TIDx);

// Since tv2 is not inlined and tv2 and tv3 are both parallelized,
// tv2, as a producer of tv3, requires a synchronization and must be
// placed in shared memory. Since it is not, lowering the fusion
// should fail.
EXPECT_THAT(
[&]() { GpuLower(&fusion).run(); },
testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr(
"Producer is required to be in Global or Shared Memory based on parallelization strategy. RAW flags: (threadIdx.x)")));
}
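
For contrast, below is a minimal sketch of the no-sync case described in the new comment in sync_information.cpp: the same fusion as RAWSync but without the merges, so the parallelized producer domain of tv2 stays a broadcast. The test name is hypothetical, this sketch is not part of the PR, and the expectation that lowering succeeds follows from the comment rather than from a verified run.

TEST_F(NVFuserTest, RAWSyncNotNeededForBroadcast) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(1);
  fusion.addInput(tv1);

  auto tv2 = broadcast(tv1, {false, true});
  auto tv3 = add(tv0, tv2);
  fusion.addOutput(tv3);

  // Unlike RAWSync above, the broadcast axis is not merged away, so the
  // parallelized producer domain of tv2 remains a broadcast. Every TIDx
  // thread computes tv2 redundantly, so no synchronization should be
  // required and lowering is expected to succeed.
  tv3->axis(1)->parallelize(ParallelType::TIDx);
  tv2->axis(1)->parallelize(ParallelType::TIDx);

  GpuLower(&fusion).run();
}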

// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser
4 changes: 1 addition & 3 deletions tests/cpp/test_gpu_view.cpp
@@ -2701,11 +2701,9 @@ TEST_F(GpuViewTest, FusionMismatchingReshape) {
// Parallelize all tensors as [BIDx, TIDx]
schedule.merge(0);
schedule.split(0, 128);
#if 0
// TODO: sync analysis is not working yet

schedule.parallelize(0, ParallelType::BIDx);
schedule.parallelize(1, ParallelType::TIDx);
#endif

// Now, tv5 looks like:
//