Add TMA support for circular buffering pass #2833

Merged
40 commits merged on Sep 29, 2024

Changes from 32 commits

Commits
12db3ee
Add allocation changes
rdspring1 Aug 21, 2024
2491171
Add Indexing changes
rdspring1 Aug 21, 2024
6d8ad5f
Add circular buffering pass and testing
rdspring1 Aug 22, 2024
25c482d
Merge branch 'main' of https://github.com/nvidia/fuser into tma_cb
rdspring1 Sep 8, 2024
2f8d9e9
predicate and mbarrier changes
rdspring1 Sep 5, 2024
2a06157
add mbarrier_wait immediately
rdspring1 Sep 7, 2024
0c8858f
skip expressions_allocated_in_main_loop
rdspring1 Sep 5, 2024
f8123af
Ensure a full warp exists if there is elect sync predicate
rdspring1 Sep 9, 2024
508d674
comments
rdspring1 Sep 9, 2024
d4c7938
Merge branch 'main' of https://github.com/nvidia/fuser into tma_cb
rdspring1 Sep 9, 2024
ccfedfc
Add compatibility check for elect sync
rdspring1 Sep 16, 2024
f29aa22
add test for elect sync compatibility
rdspring1 Sep 16, 2024
f685a9b
Use MBarrierArrive
rdspring1 Sep 16, 2024
b84eb96
comments
rdspring1 Sep 17, 2024
c1fdec5
Add has_elect_sync_predicate to kernel_summary
rdspring1 Sep 17, 2024
4f011b5
Merge branch 'main' of https://github.com/nvidia/fuser into tma_cb
rdspring1 Sep 18, 2024
d132ef4
minor fixes
rdspring1 Sep 21, 2024
c393776
use inlineAt and inlineMost
rdspring1 Sep 22, 2024
dc9f20e
add string exception check
rdspring1 Sep 22, 2024
4058d73
clean-up
rdspring1 Sep 22, 2024
4cb4342
comment
rdspring1 Sep 23, 2024
cdbf609
comment
rdspring1 Sep 23, 2024
b6c4e20
comment
rdspring1 Sep 23, 2024
b137637
comment
rdspring1 Sep 23, 2024
c319e5c
generalize short-circuit
rdspring1 Sep 23, 2024
e8c7fd5
comment
rdspring1 Sep 23, 2024
95e7bd0
use scalar hoisting
rdspring1 Sep 23, 2024
89b61bb
rename
rdspring1 Sep 23, 2024
444252d
Merge branch 'main' into tma_cb
rdspring1 Sep 25, 2024
64ef3cb
Create TmaCircularBufferInfo to consolidate data fields. (#3004)
rdspring1 Sep 25, 2024
7ebffe3
Initialize and invalidate mbarrier in allocation pass
rdspring1 Sep 26, 2024
508dbb0
comments
rdspring1 Sep 26, 2024
7ee80d3
move to allocation pass
rdspring1 Sep 26, 2024
1f5bed1
create TmaCircularBufferInfo class
rdspring1 Sep 26, 2024
95cbba1
Merge branch 'main' into tma_cb
rdspring1 Sep 26, 2024
0a6abcd
rename CloneTmaCircularBufferLoopAndInsertSync
rdspring1 Sep 27, 2024
b9cc784
comments
rdspring1 Sep 27, 2024
afb4e1c
Add PointwiseCpAsync failing test
rdspring1 Sep 27, 2024
3bcc32c
Merge branch 'main' into tma_cb
rdspring1 Sep 28, 2024
5df582a
Merge branch 'main' into tma_cb
rdspring1 Sep 29, 2024
44 changes: 5 additions & 39 deletions csrc/device_lower/lower2device.h
@@ -182,6 +182,10 @@ class GpuLower : public NonCopyable {
return circular_buffer_info_;
}

TmaCircularBufferInfo& tmaCircularBufferInfo() {
return tma_circular_buffer_info_;
}

CommonScalarMap& commonScalarMap() {
return common_scalar_map_;
}
@@ -230,32 +234,6 @@
return ldst_mbarrier_map_;
}

std::unordered_map<const Expr*, TensorView*>& ldstMBarrierTokenMap() {
return ldst_mbarrier_token_map_;
}

const std::unordered_map<const Expr*, TensorView*>& ldstMBarrierTokenMap()
const {
return ldst_mbarrier_token_map_;
}

std::unordered_set<const Expr*>& mBarrierTokenSmemAllocSet() {
return mbarrier_token_smem_alloc_set_;
}

const std::unordered_set<const Expr*>& mBarrierTokenSmemAllocSet() const {
return mbarrier_token_smem_alloc_set_;
}

std::unordered_map<const Expr*, kir::TensorIndex*>& ldstMBarrierIndexMap() {
return ldst_mbarrier_index_map_;
}

const std::unordered_map<const Expr*, kir::TensorIndex*>&
ldstMBarrierIndexMap() const {
return ldst_mbarrier_index_map_;
}

bool isNvFuserZeroEnabled() {
if (isOptionDisabled(DisableOption::MagicZero)) {
return false;
@@ -360,6 +338,7 @@ class GpuLower : public NonCopyable {
ParallelDimensionMap parallel_dimension_map_;
NonDivisibleSplitInfo non_divisible_split_info_;
CircularBufferInfo circular_buffer_info_;
TmaCircularBufferInfo tma_circular_buffer_info_;
CommonScalarMap common_scalar_map_;
FusedReductionInfo fused_reduction_info_;
std::shared_ptr<const SyncMap> sync_map_;
@@ -389,19 +368,6 @@ class GpuLower : public NonCopyable {
//! for vectorization.
std::vector<std::pair<const Val*, std::string>> validations_;

// Keep track of placeholders for tokens returned by arrive/expected tx
// mbarrier operations for each load/store operation that requires such
// synchronization
std::unordered_map<const Expr*, TensorView*> ldst_mbarrier_token_map_;

// Collection of kir::Allocate for smem buffers used for mbarrier and token
// objects from cpAsyncBulk synchronization
std::unordered_set<const Expr*> mbarrier_token_smem_alloc_set_;

// Keep track what mbarrier object is used in load/store operation that
// requires such synchronization, required by indexing pass
std::unordered_map<const Expr*, kir::TensorIndex*> ldst_mbarrier_index_map_;

Fusion* fusion_ = nullptr;

// A temporary flag which is true if the fusion uses any feature that requires
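For orientation, the removed GpuLower members (the token map, the smem-allocation set, and the mbarrier index map) are consolidated behind the new tmaCircularBufferInfo() accessor. Below is a minimal, self-contained C++ sketch of such a struct; only ldst_mbarrier_token_map is visible later in this diff, so the stand-in forward declarations and the other two fields are assumptions inferred from the removed accessors, not the actual upstream definition.

#include <unordered_map>
#include <unordered_set>

// Stand-ins so the sketch compiles on its own; the real types live in nvFuser.
struct Expr;
struct TensorView;
namespace kir {
struct TensorIndex;
} // namespace kir

// Hypothetical consolidated bookkeeping struct for TMA circular buffering.
struct TmaCircularBufferInfo {
  // Token placeholders returned by MBarrierArriveExpectTx / MBarrierArrive,
  // keyed by the load/store (or init/inval) expression they synchronize.
  std::unordered_map<const Expr*, TensorView*> ldst_mbarrier_token_map;
  // kir::Allocate nodes for the smem mbarrier/token buffers (assumed field).
  std::unordered_set<const Expr*> mbarrier_token_smem_alloc_set;
  // mbarrier index used by each load/store, needed by the indexing pass
  // (assumed field).
  std::unordered_map<const Expr*, kir::TensorIndex*> ldst_mbarrier_index_map;
};

// Passes then query a single accessor instead of three, e.g.
//   GpuLower::current()->tmaCircularBufferInfo().ldst_mbarrier_token_map;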
35 changes: 28 additions & 7 deletions csrc/device_lower/pass/alias_memory.cpp
@@ -562,6 +562,7 @@ struct AllocationInfo {
const kir::Allocate* alias_to = nullptr;
bool is_inner_alias = false;
bool should_try_alias = true;
bool is_cp_async_bulk = false;
MemoryType mem_type = MemoryType::Local;
DataType data_type = DataType::Float;
std::string size_expr;
@@ -840,6 +841,9 @@ class AllocationInfoMap : private kir::IrVisitor {
alloc_info->size_expr = size_print;
alloc_info->loop_info = current_stack_.back();
alloc_info->should_try_alias = should_try_alias;
alloc_info->is_cp_async_bulk =
(tv->definition() != nullptr &&
ir_utils::isCpAsyncBulk(tv->definition()));

// record short cuts
allocation_info_map_[alloc] = alloc_info;
@@ -886,23 +890,37 @@ class AllocationInfoMap : private kir::IrVisitor {
// The liveness of the mbarrier and its token are mapped together.
// The token is the mbarrier state of the last phase.
if (auto init = dynamic_cast<kir::MBarrierInit*>(expr)) {
mark_liveness(init->mbarrier()->as<TensorView>(), /*is_write=*/true);
TensorView* tv = (init->mbarrier()->isA<kir::TensorIndex>())
? init->mbarrier()->as<kir::TensorIndex>()->view()
: init->mbarrier()->as<TensorView>();
mark_liveness(tv, /*is_write=*/true);

// Register start of lifetime for a mbarrier token returned by
// MBarrierArriveExpectTx and MBarrierArrive.
if (GpuLower::current()->ldstMBarrierTokenMap().count(expr) > 0) {
if (GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map.count(expr) > 0) {
mark_liveness(
GpuLower::current()->ldstMBarrierTokenMap()[expr],
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[expr],
/*is_write=*/true);
}
} else if (auto inval = dynamic_cast<kir::MBarrierInvalidate*>(expr)) {
mark_liveness(inval->mbarrier()->as<TensorView>(), /*is_write=*/false);
TensorView* tv = (inval->mbarrier()->isA<kir::TensorIndex>())
? inval->mbarrier()->as<kir::TensorIndex>()->view()
: inval->mbarrier()->as<TensorView>();
mark_liveness(tv, /*is_write=*/false);

// Register end of lifetime for a mbarrier token returned by
// MBarrierArriveExpectTx and MBarrierArrive.
if (GpuLower::current()->ldstMBarrierTokenMap().count(expr) > 0) {
if (GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map.count(expr) > 0) {
mark_liveness(
GpuLower::current()->ldstMBarrierTokenMap()[expr],
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[expr],
/*is_write=*/false);
}
}
@@ -1761,7 +1779,10 @@ class StackBasedSharedMemAllocator : kir::IrVisitor {
auto top_size = allocSizeBytes(top_alloc);
auto unaligned_address =
SimplifyingIrBuilder::addExpr(top_alloc->address(), top_size);
auto aligned_address = alignExpr(unaligned_address);
// Shared memory allocations must be 128B aligned for cpAsyncBulk
// operations to avoid CUDA_ERROR_MISALIGNED_ADDRESS.
auto aligned_address = alignExpr(
unaligned_address, (alloc_info->is_cp_async_bulk) ? 128 : 16);
// TODO: hoisting of addresses using for_loops_ recorded at first write
alloc->setAddress(aligned_address);
}
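The functional change in this file is the alignment rule: a shared-memory buffer whose producer is a cp.async.bulk (TMA) op has its start address rounded up to 128 bytes instead of the default 16. A minimal, self-contained sketch of that rounding, using a hypothetical alignUp helper rather than nvFuser's alignExpr:

#include <cassert>
#include <cstdint>

// Hypothetical helper: round a raw shared-memory offset up to the next
// multiple of `alignment`.
constexpr int64_t alignUp(int64_t unaligned_address, int64_t alignment) {
  return (unaligned_address + alignment - 1) / alignment * alignment;
}

int main() {
  // A previous allocation ends at byte offset 40. A regular smem buffer may
  // start at the next 16B boundary (48), but a cp.async.bulk (TMA) destination
  // must start at a 128B boundary to avoid CUDA_ERROR_MISALIGNED_ADDRESS.
  assert(alignUp(40, 16) == 48);
  assert(alignUp(40, 128) == 128);

  bool is_cp_async_bulk = true;  // mirrors AllocationInfo::is_cp_async_bulk
  assert(alignUp(40, is_cp_async_bulk ? 128 : 16) == 128);
  return 0;
}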
93 changes: 63 additions & 30 deletions csrc/device_lower/pass/allocation.cpp
@@ -483,19 +483,9 @@ class AllocationInserter : public kir::ExprMutator {
.build();
mbarrier->setMemoryType(MemoryType::Shared);

// The wait condition for mbarrier is a single thread and the expected
// number of transaction bytes
kir::MBarrierInit* mbarrier_init = IrBuilder::create<kir::MBarrierInit>(
mbarrier, expr->container()->oneVal(DataType::UInt32));

kir::Allocate* mbarrier_alloc =
IrBuilder::create<kir::Allocate>(mbarrier, MemoryType::Shared);

Scope* expr_scope = scope_.empty() ? nullptr : scope_.back();

kir::MBarrierInvalidate* mbarrier_inval =
IrBuilder::create<kir::MBarrierInvalidate>(mbarrier);

// For circular buffers we need to prepare a placeholder for the
// tokens created by 'MBarrierArriveExpectTx' IR node. The tokens are
// placed in shared memory and used by threads in a block.
@@ -510,38 +500,81 @@
kir::Allocate* mbarrier_tokens_alloc = IrBuilder::create<kir::Allocate>(
mbarrier_tokens, MemoryType::Shared);

NVF_ERROR(ir_utils::isCpAsyncBulkLoad(expr));
LoadStoreOp* ldst = expr->as<LoadStoreOp>();
TensorView* out_tv = ldst->out()->as<TensorView>();
ForLoop* circular_buffer_loop =
GpuLower::current()->circularBufferInfo().getCircularBufferLoop(
out_tv, for_loops_);

auto&& [pre_prologue_init, mbarrier_init] =
initializeMbarrier(circular_buffer_loop, ldst, mbarrier);

auto&& [post_epilogue_inval, mbarrier_inval] =
invalidateMbarrier(circular_buffer_loop, ldst, mbarrier);

// Block sync is necessary to finish mbarrier initialization.
kir::BlockSync* sync = IrBuilder::create<kir::BlockSync>(false);

// Add tokens, mbarriers, init, and inval operations around tma
// expression like this:
//
// __shared__ tokens[num_stages];
// __shared__ mbarrier[num_stages];
// for (circular_buffer_stage) {
// init(mbarrier[stage]);
// }
// block_sync();
//
// for (circular_buffer_loop) {
// __shared__ tokens[num_stages];
// __shared__ mbarrier[num_stages];
// init(mbarrier);
// cp.async.bulk(data, mbarrier);
// inval(mbarrier);
// }
//
// for (circular_buffer_stage) {
// inval(mbarrier[stage]);
// }
//

// NOTE: Block sync ir node is not added here. It will be added in the
// circular buffering pass
registerInsertBefore(expr, mbarrier_tokens_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_init, expr_scope);
registerInsertAfter(expr, mbarrier_inval, expr_scope);
// Find the scope containing the circular buffer for-loop. It is the
// scope one level higher than the circular buffer loop scope in scope_.
auto scope_iter = std::find(
scope_.begin(), scope_.end(), &circular_buffer_loop->body());
NVF_ERROR(scope_iter != scope_.end());
Scope* scope_containing_circular_buffer_loop =
(scope_iter == scope_.begin()) ? nullptr : *(scope_iter - 1);
registerInsertBefore(
circular_buffer_loop,
mbarrier_tokens_alloc,
scope_containing_circular_buffer_loop);
registerInsertBefore(
circular_buffer_loop,
mbarrier_alloc,
scope_containing_circular_buffer_loop);

registerInsertBefore(
circular_buffer_loop,
pre_prologue_init,
scope_containing_circular_buffer_loop);
registerInsertBefore(
circular_buffer_loop, sync, scope_containing_circular_buffer_loop);
registerInsertAfter(
circular_buffer_loop,
post_epilogue_inval,
scope_containing_circular_buffer_loop);

// Map LoadStoreOp expression to ir nodes created in this pass
GpuLower::current()->ldstMBarrierMap()[expr] = mbarrier;
GpuLower::current()->ldstMBarrierTokenMap()[expr] = mbarrier_tokens;
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[expr] = mbarrier_tokens;
// Register tokens placeholder for MBarrierInit and MBarrierInvalidate,
// needed to manage lifetime of smem buffer in alias memory
GpuLower::current()->ldstMBarrierTokenMap()[mbarrier_init] =
mbarrier_tokens;
GpuLower::current()->ldstMBarrierTokenMap()[mbarrier_inval] =
mbarrier_tokens;
// Keep track of kir::Allocate for mBarrier and token objects,
// to simplify circular buffering pass logic
GpuLower::current()->mBarrierTokenSmemAllocSet().insert(mbarrier_alloc);
GpuLower::current()->mBarrierTokenSmemAllocSet().insert(
mbarrier_tokens_alloc);
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[mbarrier_init] = mbarrier_tokens;
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[mbarrier_inval] = mbarrier_tokens;
} else {
// create and allocate a memory barrier
TensorView* mbarrier = TensorViewBuilder()
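The comment block in this hunk describes the loop structure the allocation pass now produces: mbarrier and token buffers are allocated once, every stage's mbarrier is initialized in a pre-prologue loop, a block sync publishes the initialization, the main circular-buffer loop issues the TMA copies, and a post-epilogue loop invalidates the mbarriers. The sketch below only models that shape on the host with illustrative names; it uses no real mbarrier, sync, or TMA calls.

#include <cstdio>
#include <vector>

// Illustrative stand-in for an mbarrier slot; not a real CUDA barrier.
struct MBarrier {
  bool live = false;
};

int main() {
  constexpr int num_stages = 3;
  constexpr int num_iterations = 8;

  // __shared__ mbarrier[num_stages];  (plus the token buffer, omitted here)
  std::vector<MBarrier> mbarrier(num_stages);

  // Pre-prologue: initialize every stage's mbarrier once, before the loop.
  for (int stage = 0; stage < num_stages; ++stage) {
    mbarrier[stage].live = true;  // init(mbarrier[stage])
  }
  // A block sync would follow so all threads observe the initialization.

  // Main circular-buffer loop: iteration i reuses slot i % num_stages.
  for (int i = 0; i < num_iterations; ++i) {
    int stage = i % num_stages;
    std::printf("iteration %d -> mbarrier[%d]\n", i, stage);
  }

  // Post-epilogue: invalidate every stage's mbarrier after the loop.
  for (int stage = 0; stage < num_stages; ++stage) {
    mbarrier[stage].live = false;  // inval(mbarrier[stage])
  }
  return 0;
}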