Indexing changes for TMA Circular Buffering #2825

Merged · 4 commits · Sep 5, 2024
Changes from 2 commits
39 changes: 39 additions & 0 deletions csrc/device_lower/lower2device.h
@@ -226,6 +226,32 @@ class GpuLower : public NonCopyable {
return ldst_mbarrier_map_;
}

std::unordered_map<const Expr*, TensorView*>& ldstMBarrierTokenMap() {
return ldst_mbarrier_token_map_;
}

const std::unordered_map<const Expr*, TensorView*>& ldstMBarrierTokenMap()
const {
return ldst_mbarrier_token_map_;
}

std::unordered_set<const Expr*>& mBarrierTokenSmemAllocSet() {
return mbarrier_token_smem_alloc_set_;
}

const std::unordered_set<const Expr*>& mBarrierTokenSmemAllocSet() const {
return mbarrier_token_smem_alloc_set_;
}

std::unordered_map<const Expr*, kir::TensorIndex*>& ldstMBarrierIndexMap() {
return ldst_mbarrier_index_map_;
}

const std::unordered_map<const Expr*, kir::TensorIndex*>&
ldstMBarrierIndexMap() const {
return ldst_mbarrier_index_map_;
}

bool isNvFuserZeroEnabled() {
if (isOptionDisabled(DisableOption::MagicZero)) {
return false;
@@ -359,6 +385,19 @@ class GpuLower : public NonCopyable {
//! for vectorization.
std::vector<std::pair<const Val*, std::string>> validations_;

// Keep track of placeholders for the tokens returned by arrive/expect-tx
// mbarrier operations, for each load/store operation that requires such
// synchronization
std::unordered_map<const Expr*, TensorView*> ldst_mbarrier_token_map_;

// Collection of kir::Allocate expressions for the smem buffers that hold the
// mbarrier and token objects used for cpAsyncBulk synchronization
std::unordered_set<const Expr*> mbarrier_token_smem_alloc_set_;

// Keep track of which mbarrier object is used by each load/store operation
// that requires such synchronization; needed by the indexing pass
std::unordered_map<const Expr*, kir::TensorIndex*> ldst_mbarrier_index_map_;

Fusion* fusion_ = nullptr;

// A temporary flag which is true if the fusion uses any feature that requires
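The three new accessors expose per-expression bookkeeping to later lowering passes. As a rough, hedged illustration of how a downstream pass might consult them (the helper name below is hypothetical and not part of this PR; only the GpuLower accessors shown in this diff are assumed):

// Hypothetical helper, for illustration only: look up the mbarrier, the token
// placeholder, and the per-stage index recorded for a cp.async.bulk load.
void inspectMBarrierInfo(Expr* ldst) {
  GpuLower* lower = GpuLower::current();

  auto mbarrier_it = lower->ldstMBarrierMap().find(ldst);
  if (mbarrier_it == lower->ldstMBarrierMap().end()) {
    // Not a load/store that was given an mbarrier in the allocation pass.
    return;
  }
  TensorView* mbarrier = mbarrier_it->second;

  // Shared-memory placeholder for the tokens returned by arrive/expect-tx.
  auto token_it = lower->ldstMBarrierTokenMap().find(ldst);
  TensorView* tokens =
      token_it != lower->ldstMBarrierTokenMap().end() ? token_it->second
                                                      : nullptr;

  // Element of the mbarrier array chosen by the indexing pass, once populated.
  auto index_it = lower->ldstMBarrierIndexMap().find(ldst);
  kir::TensorIndex* mbarrier_index =
      index_it != lower->ldstMBarrierIndexMap().end() ? index_it->second
                                                      : nullptr;

  (void)mbarrier;
  (void)tokens;
  (void)mbarrier_index;
}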
49 changes: 38 additions & 11 deletions csrc/device_lower/pass/alias_memory.cpp
@@ -866,20 +866,47 @@ class AllocationInfoMap : private kir::IrVisitor {
}

void collectLivenessInfoOfExprMBarrier(Expr* expr) {
const auto expr_pos = scope_map_.getExprPos(expr);
int64_t expr_pos = scope_map_.getExprPos(expr);

auto mark_liveness = [&expr_pos, this](TensorView* tv, bool is_write) {
AllocationInfo* alloc_info = getAllocInfoFromTV(tv);
if (is_write) {
alloc_info->inner_live_interval->markWrite(expr_pos);
} else {
alloc_info->inner_live_interval->markRead(expr_pos);
}
ScopeInfo* outer_loop_info = ascendLoopNestToSameLevelAs(alloc_info);
int64_t outer_pos =
outer_loop_info ? outer_loop_info->start_pos : expr_pos;
if (is_write) {
alloc_info->outer_live_interval->markWrite(outer_pos);
} else {
alloc_info->outer_live_interval->markRead(outer_pos);
}
};

// The mbarrier and its token share the same liveness; the token is the
// mbarrier state of the last phase.
if (auto init = dynamic_cast<kir::MBarrierInit*>(expr)) {
auto alloc_info = getAllocInfoFromTV(init->mbarrier()->as<TensorView>());
alloc_info->inner_live_interval->markWrite(expr_pos);
auto outer_loop_info = ascendLoopNestToSameLevelAs(alloc_info);
auto write_pos = outer_loop_info ? outer_loop_info->start_pos : expr_pos;
alloc_info->outer_live_interval->markWrite(write_pos);
mark_liveness(init->mbarrier()->as<TensorView>(), /*is_write=*/true);

// Register the start of lifetime for an mbarrier token returned by
// MBarrierArriveExpectTx and MBarrierArrive.
if (GpuLower::current()->ldstMBarrierTokenMap().count(expr) > 0) {
mark_liveness(
GpuLower::current()->ldstMBarrierTokenMap()[expr],
/*is_write=*/true);
}
} else if (auto inval = dynamic_cast<kir::MBarrierInvalidate*>(expr)) {
auto alloc_info = getAllocInfoFromTV(inval->mbarrier()->as<TensorView>());
alloc_info->inner_live_interval->markRead(expr_pos);
auto outer_loop_info = ascendLoopNestToSameLevelAs(alloc_info);
auto write_pos = outer_loop_info ? outer_loop_info->start_pos : expr_pos;
alloc_info->outer_live_interval->markRead(write_pos);
mark_liveness(inval->mbarrier()->as<TensorView>(), /*is_write=*/false);

// Register the end of lifetime for an mbarrier token returned by
// MBarrierArriveExpectTx and MBarrierArrive.
if (GpuLower::current()->ldstMBarrierTokenMap().count(expr) > 0) {
mark_liveness(
GpuLower::current()->ldstMBarrierTokenMap()[expr],
/*is_write=*/false);
}
}
}

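Routing both the mbarrier TensorView and its token placeholder through the same mark_liveness lambda gives the two shared-memory buffers identical inner and outer live intervals, so the alias-memory pass will not reclaim either buffer while the other is still in use. Below is a minimal sketch of the interval bookkeeping that markWrite()/markRead() imply; it is an illustration under assumed semantics, not the actual AllocationInfo classes:

#include <algorithm>
#include <cstdint>
#include <limits>

// Toy live interval: the first write opens it, the last read closes it.
// Two shared-memory buffers may alias only if their intervals do not overlap.
struct ToyLiveInterval {
  int64_t first_write = std::numeric_limits<int64_t>::max();
  int64_t last_read = std::numeric_limits<int64_t>::min();

  void markWrite(int64_t pos) { first_write = std::min(first_write, pos); }
  void markRead(int64_t pos) { last_read = std::max(last_read, pos); }

  bool overlapsWith(const ToyLiveInterval& other) const {
    return first_write <= other.last_read && other.first_write <= last_read;
  }
};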
129 changes: 102 additions & 27 deletions csrc/device_lower/pass/allocation.cpp
@@ -274,6 +274,8 @@ class AllocationInserter : public kir::ExprMutator {
return;
}

int64_t circular_buffer_depth = 1;

// Found where the allocation needs to be inserted

for (const auto i : c10::irange(expr->outputs().size())) {
@@ -345,6 +347,16 @@
auto alloc_expr = createAllocExpr(allocation, is_output);
auto init_expr = createInitExpr(allocation, init);

// Find the largest circular buffer depth; used for TMA buffer allocation
if (out_tv->isCircularBuffered() && circular_buffer_depth == 1) {
circular_buffer_depth = out_tv->circularBufferDepth();
}
NVF_ERROR(
circular_buffer_depth == 1 ||
circular_buffer_depth == out_tv->circularBufferDepth(),
"Expected all output TensorViews for the same expression ",
"to have the same circular_buffer_depth");

// Write information to GPULower
writeInfoToGPULower(allocation, alloc_expr);

@@ -378,33 +390,96 @@
// solution, we should remove this after we have a better way to handle
// synchronizations for cp.async.bulk.
if (ir_utils::isCpAsyncBulkLoad(expr)) {
// create and allocate a memory barrier
TensorView* mbarrier = TensorViewBuilder()
.shape(std::vector<int64_t>{})
.dtype(DataType::UInt)
.contiguity(true)
.build();
mbarrier->setMemoryType(MemoryType::Shared);
auto mbarrier_init = IrBuilder::create<kir::MBarrierInit>(
mbarrier,
simplifyExpr(SimplifyingIrBuilder::maybeCastExpr(
DataType::UInt32,
lower_utils::getNumThreadsInTensorView(
expr->output(0)->as<TensorView>()))));
auto sync_init = IrBuilder::create<kir::BlockSync>();
auto mbarrier_inval =
IrBuilder::create<kir::MBarrierInvalidate>(mbarrier);
auto sync_inval = IrBuilder::create<kir::BlockSync>();

kir::Allocate* mbarrier_alloc =
IrBuilder::create<kir::Allocate>(mbarrier, MemoryType::Shared);
Scope* expr_scope = scope_.empty() ? nullptr : scope_.back();
registerInsertBefore(expr, mbarrier_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_init, expr_scope);
registerInsertBefore(expr, sync_init, expr_scope);
registerInsertAfter(expr, mbarrier_inval, expr_scope);
registerInsertAfter(expr, sync_inval, expr_scope);
GpuLower::current()->ldstMBarrierMap()[expr] = mbarrier;
if (circular_buffer_depth > 1) {
// Create and allocate a memory barrier. If this is a circular buffer,
// then allocate an array of mbarrier objects. mbarrier::init and
// mbarrier::inval will be updated in the circular buffering pass, but we
// add them here so shared memory is handled correctly in the alias memory
// pass.
TensorView* mbarrier =
TensorViewBuilder()
.shape(std::vector<int64_t>{circular_buffer_depth})
.dtype(DataType::UInt)
.contiguity(true)
.build();
mbarrier->setMemoryType(MemoryType::Shared);

// The wait condition for the mbarrier is a single thread arrival plus the
// expected number of transaction bytes
kir::MBarrierInit* mbarrier_init = IrBuilder::create<kir::MBarrierInit>(
mbarrier, expr->container()->oneVal(DataType::UInt32));

kir::Allocate* mbarrier_alloc =
IrBuilder::create<kir::Allocate>(mbarrier, MemoryType::Shared);

Scope* expr_scope = scope_.empty() ? nullptr : scope_.back();

kir::MBarrierInvalidate* mbarrier_inval =
IrBuilder::create<kir::MBarrierInvalidate>(mbarrier);

// For circular buffers we need to prepare a placeholder for the
// tokens created by the 'MBarrierArriveExpectTx' IR node. The tokens are
// placed in shared memory and used by the threads in a block.
TensorView* mbarrier_tokens =
TensorViewBuilder()
.shape(std::vector<int64_t>{circular_buffer_depth})
.dtype(DataType::UInt)
.contiguity(true)
.build();
mbarrier_tokens->setMemoryType(MemoryType::Shared);

kir::Allocate* mbarrier_tokens_alloc = IrBuilder::create<kir::Allocate>(
mbarrier_tokens, MemoryType::Shared);

// NOTE: The block sync IR node is not added here; it will be added in the
// circular buffering pass.
registerInsertBefore(expr, mbarrier_tokens_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_init, expr_scope);
registerInsertAfter(expr, mbarrier_inval, expr_scope);

// Map LoadStoreOp expression to ir nodes created in this pass
GpuLower::current()->ldstMBarrierMap()[expr] = mbarrier;
GpuLower::current()->ldstMBarrierTokenMap()[expr] = mbarrier_tokens;
// Register the token placeholder for MBarrierInit and MBarrierInvalidate;
// needed to manage the lifetime of the smem buffer in the alias memory pass
GpuLower::current()->ldstMBarrierTokenMap()[mbarrier_init] =
mbarrier_tokens;
GpuLower::current()->ldstMBarrierTokenMap()[mbarrier_inval] =
mbarrier_tokens;
// Keep track of the kir::Allocate nodes for the mbarrier and token objects
// to simplify the circular buffering pass logic
GpuLower::current()->mBarrierTokenSmemAllocSet().insert(mbarrier_alloc);
GpuLower::current()->mBarrierTokenSmemAllocSet().insert(
mbarrier_tokens_alloc);
} else {
// create and allocate a memory barrier
TensorView* mbarrier = TensorViewBuilder()
.shape(std::vector<int64_t>{})
.dtype(DataType::UInt)
.contiguity(true)
.build();
mbarrier->setMemoryType(MemoryType::Shared);
auto mbarrier_init = IrBuilder::create<kir::MBarrierInit>(
mbarrier,
simplifyExpr(SimplifyingIrBuilder::maybeCastExpr(
DataType::UInt32,
lower_utils::getNumThreadsInTensorView(
expr->output(0)->as<TensorView>()))));
auto sync_init = IrBuilder::create<kir::BlockSync>();
auto mbarrier_inval =
IrBuilder::create<kir::MBarrierInvalidate>(mbarrier);
auto sync_inval = IrBuilder::create<kir::BlockSync>();

kir::Allocate* mbarrier_alloc =
IrBuilder::create<kir::Allocate>(mbarrier, MemoryType::Shared);
Scope* expr_scope = scope_.empty() ? nullptr : scope_.back();
registerInsertBefore(expr, mbarrier_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_init, expr_scope);
registerInsertBefore(expr, sync_init, expr_scope);
registerInsertAfter(expr, mbarrier_inval, expr_scope);
registerInsertAfter(expr, sync_inval, expr_scope);
GpuLower::current()->ldstMBarrierMap()[expr] = mbarrier;
}
}
}

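To make the depth-sized mbarrier and token arrays concrete: each circular-buffer stage (iteration % depth) owns one mbarrier element and one token slot, where a token is essentially the barrier phase captured at arrive time that a later wait must outlast. The following self-contained toy model is only meant to illustrate that relationship; it is an assumption for intuition, not nvFuser or CUDA mbarrier code:

#include <cstdint>
#include <vector>

// Toy barrier: a token is the phase recorded at arrive time; waiting succeeds
// once the barrier has advanced past that phase (e.g., all bytes arrived).
struct ToyBarrier {
  uint64_t phase = 0;
  uint64_t arriveToken() const { return phase; }
  void complete() { ++phase; }
  bool isDone(uint64_t token) const { return phase > token; }
};

int main() {
  const int64_t depth = 3;
  // One barrier and one remembered token per circular-buffer stage, mirroring
  // the two depth-sized shared-memory arrays allocated in this pass.
  std::vector<ToyBarrier> mbarrier(depth);
  std::vector<uint64_t> token(depth, 0);

  for (int64_t i = 0; i < 9; ++i) {
    const int64_t stage = i % depth; // ring-buffer slot reused every `depth` iterations
    token[stage] = mbarrier[stage].arriveToken(); // arrive: remember the phase
    mbarrier[stage].complete();                   // copy engine finishes the transfer
    if (!mbarrier[stage].isDone(token[stage])) {  // consumer waits on the token
      return 1;
    }
  }
  return 0;
}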