Add TMA support for circular buffering pass #2833

Merged
40 commits merged on Sep 29, 2024

Changes from 32 commits

Commits
12db3ee
Add allocation changes
rdspring1 Aug 21, 2024
2491171
Add Indexing changes
rdspring1 Aug 21, 2024
6d8ad5f
Add circular buffering pass and testing
rdspring1 Aug 22, 2024
25c482d
Merge branch 'main' of https://github.com/nvidia/fuser into tma_cb
rdspring1 Sep 8, 2024
2f8d9e9
predicate and mbarrier changes
rdspring1 Sep 5, 2024
2a06157
add mbarrier_wait immediately
rdspring1 Sep 7, 2024
0c8858f
skip expressions_allocated_in_main_loop
rdspring1 Sep 5, 2024
f8123af
Ensure a full warp exists if there is elect sync predicate
rdspring1 Sep 9, 2024
508d674
comments
rdspring1 Sep 9, 2024
d4c7938
Merge branch 'main' of https://github.com/nvidia/fuser into tma_cb
rdspring1 Sep 9, 2024
ccfedfc
Add compatibility check for elect sync
rdspring1 Sep 16, 2024
f29aa22
add test for elect sync compatibility
rdspring1 Sep 16, 2024
f685a9b
Use MBarrierArrive
rdspring1 Sep 16, 2024
b84eb96
comments
rdspring1 Sep 17, 2024
c1fdec5
Add has_elect_sync_predicate to kernel_summary
rdspring1 Sep 17, 2024
4f011b5
Merge branch 'main' of https://github.com/nvidia/fuser into tma_cb
rdspring1 Sep 18, 2024
d132ef4
minor fixes
rdspring1 Sep 21, 2024
c393776
use inlineAt and inlineMost
rdspring1 Sep 22, 2024
dc9f20e
add string exception check
rdspring1 Sep 22, 2024
4058d73
clean-up
rdspring1 Sep 22, 2024
4cb4342
comment
rdspring1 Sep 23, 2024
cdbf609
comment
rdspring1 Sep 23, 2024
b6c4e20
comment
rdspring1 Sep 23, 2024
b137637
comment
rdspring1 Sep 23, 2024
c319e5c
generalize short-circuit
rdspring1 Sep 23, 2024
e8c7fd5
comment
rdspring1 Sep 23, 2024
95e7bd0
use scalar hoisting
rdspring1 Sep 23, 2024
89b61bb
rename
rdspring1 Sep 23, 2024
444252d
Merge branch 'main' into tma_cb
rdspring1 Sep 25, 2024
64ef3cb
Create TmaCircularBufferInfo to consolidate data fields. (#3004)
rdspring1 Sep 25, 2024
7ebffe3
Initialize and invalidate mbarrier in allocation pass
rdspring1 Sep 26, 2024
508dbb0
comments
rdspring1 Sep 26, 2024
7ee80d3
move to allocation pass
rdspring1 Sep 26, 2024
1f5bed1
create TmaCircularBufferInfo class
rdspring1 Sep 26, 2024
95cbba1
Merge branch 'main' into tma_cb
rdspring1 Sep 26, 2024
0a6abcd
rename CloneTmaCircularBufferLoopAndInsertSync
rdspring1 Sep 27, 2024
b9cc784
comments
rdspring1 Sep 27, 2024
afb4e1c
Add PointwiseCpAsync failing test
rdspring1 Sep 27, 2024
3bcc32c
Merge branch 'main' into tma_cb
rdspring1 Sep 28, 2024
5df582a
Merge branch 'main' into tma_cb
rdspring1 Sep 29, 2024
44 changes: 5 additions & 39 deletions csrc/device_lower/lower2device.h
@@ -182,6 +182,10 @@ class GpuLower : public NonCopyable {
return circular_buffer_info_;
}

TmaCircularBufferInfo& tmaCircularBufferInfo() {
return tma_circular_buffer_info_;
}

CommonScalarMap& commonScalarMap() {
return common_scalar_map_;
}
@@ -230,32 +234,6 @@
return ldst_mbarrier_map_;
}

std::unordered_map<const Expr*, TensorView*>& ldstMBarrierTokenMap() {
return ldst_mbarrier_token_map_;
}

const std::unordered_map<const Expr*, TensorView*>& ldstMBarrierTokenMap()
const {
return ldst_mbarrier_token_map_;
}

std::unordered_set<const Expr*>& mBarrierTokenSmemAllocSet() {
return mbarrier_token_smem_alloc_set_;
}

const std::unordered_set<const Expr*>& mBarrierTokenSmemAllocSet() const {
return mbarrier_token_smem_alloc_set_;
}

std::unordered_map<const Expr*, kir::TensorIndex*>& ldstMBarrierIndexMap() {
return ldst_mbarrier_index_map_;
}

const std::unordered_map<const Expr*, kir::TensorIndex*>&
ldstMBarrierIndexMap() const {
return ldst_mbarrier_index_map_;
}

bool isNvFuserZeroEnabled() {
if (isOptionDisabled(DisableOption::MagicZero)) {
return false;
@@ -360,6 +338,7 @@ class GpuLower : public NonCopyable {
ParallelDimensionMap parallel_dimension_map_;
NonDivisibleSplitInfo non_divisible_split_info_;
CircularBufferInfo circular_buffer_info_;
TmaCircularBufferInfo tma_circular_buffer_info_;
CommonScalarMap common_scalar_map_;
FusedReductionInfo fused_reduction_info_;
std::shared_ptr<const SyncMap> sync_map_;
@@ -389,19 +368,6 @@ class GpuLower : public NonCopyable {
//! for vectorization.
std::vector<std::pair<const Val*, std::string>> validations_;

// Keep track of placeholders for tokens returned by arrive/expected tx
// mbarrier operations for each load/store operation that requires such
// synchronization
std::unordered_map<const Expr*, TensorView*> ldst_mbarrier_token_map_;

// Collection of kir::Allocate for smem buffers used for mbarrier and token
// objects from cpAsyncBulk synchronization
std::unordered_set<const Expr*> mbarrier_token_smem_alloc_set_;

// Keep track what mbarrier object is used in load/store operation that
// requires such synchronization, required by indexing pass
std::unordered_map<const Expr*, kir::TensorIndex*> ldst_mbarrier_index_map_;

Fusion* fusion_ = nullptr;

// A temporary flag which is true if the fusion uses any feature that requires
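For orientation, the removed GpuLower members (the token map, the smem-allocation set, and the mbarrier index map) are consolidated behind the new tmaCircularBufferInfo() accessor. Below is a minimal, self-contained C++ sketch of such a struct; only ldst_mbarrier_token_map is visible later in this diff, so the stand-in forward declarations and the other two fields are assumptions inferred from the removed accessors, not the actual upstream definition.

#include <unordered_map>
#include <unordered_set>

// Stand-ins so the sketch compiles on its own; the real types live in nvFuser.
struct Expr;
struct TensorView;
namespace kir {
struct TensorIndex;
} // namespace kir

// Hypothetical consolidated bookkeeping struct for TMA circular buffering.
struct TmaCircularBufferInfo {
  // Token placeholders returned by MBarrierArriveExpectTx / MBarrierArrive,
  // keyed by the load/store (or init/inval) expression they synchronize.
  std::unordered_map<const Expr*, TensorView*> ldst_mbarrier_token_map;
  // kir::Allocate nodes for the smem mbarrier/token buffers (assumed field).
  std::unordered_set<const Expr*> mbarrier_token_smem_alloc_set;
  // mbarrier index used by each load/store, needed by the indexing pass
  // (assumed field).
  std::unordered_map<const Expr*, kir::TensorIndex*> ldst_mbarrier_index_map;
};

// Passes then query a single accessor instead of three, e.g.
//   GpuLower::current()->tmaCircularBufferInfo().ldst_mbarrier_token_map;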
35 changes: 28 additions & 7 deletions csrc/device_lower/pass/alias_memory.cpp
@@ -562,6 +562,7 @@ struct AllocationInfo {
const kir::Allocate* alias_to = nullptr;
bool is_inner_alias = false;
bool should_try_alias = true;
bool is_cp_async_bulk = false;
MemoryType mem_type = MemoryType::Local;
DataType data_type = DataType::Float;
std::string size_expr;
@@ -840,6 +841,9 @@ class AllocationInfoMap : private kir::IrVisitor {
alloc_info->size_expr = size_print;
alloc_info->loop_info = current_stack_.back();
alloc_info->should_try_alias = should_try_alias;
alloc_info->is_cp_async_bulk =
(tv->definition() != nullptr &&
ir_utils::isCpAsyncBulk(tv->definition()));

// record short cuts
allocation_info_map_[alloc] = alloc_info;
@@ -886,23 +890,37 @@ class AllocationInfoMap : private kir::IrVisitor {
// The liveness of the mbarrier and its token are mapped together.
// The token is the mbarrier state of the last phase.
if (auto init = dynamic_cast<kir::MBarrierInit*>(expr)) {
mark_liveness(init->mbarrier()->as<TensorView>(), /*is_write=*/true);
TensorView* tv = (init->mbarrier()->isA<kir::TensorIndex>())
? init->mbarrier()->as<kir::TensorIndex>()->view()
: init->mbarrier()->as<TensorView>();
mark_liveness(tv, /*is_write=*/true);

// Register start of lifetime for a mbarrier token returned by
// MBarrierArriveExpectTx and MBarrierArrive.
if (GpuLower::current()->ldstMBarrierTokenMap().count(expr) > 0) {
if (GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map.count(expr) > 0) {
mark_liveness(
GpuLower::current()->ldstMBarrierTokenMap()[expr],
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[expr],
/*is_write=*/true);
}
} else if (auto inval = dynamic_cast<kir::MBarrierInvalidate*>(expr)) {
mark_liveness(inval->mbarrier()->as<TensorView>(), /*is_write=*/false);
TensorView* tv = (inval->mbarrier()->isA<kir::TensorIndex>())
? inval->mbarrier()->as<kir::TensorIndex>()->view()
: inval->mbarrier()->as<TensorView>();
mark_liveness(tv, /*is_write=*/false);

// Register end of lifetime for a mbarrier token returned by
// MBarrierArriveExpectTx and MBarrierArrive.
if (GpuLower::current()->ldstMBarrierTokenMap().count(expr) > 0) {
if (GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map.count(expr) > 0) {
mark_liveness(
GpuLower::current()->ldstMBarrierTokenMap()[expr],
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[expr],
/*is_write=*/false);
}
}
@@ -1761,7 +1779,10 @@ class StackBasedSharedMemAllocator : kir::IrVisitor {
auto top_size = allocSizeBytes(top_alloc);
auto unaligned_address =
SimplifyingIrBuilder::addExpr(top_alloc->address(), top_size);
auto aligned_address = alignExpr(unaligned_address);
// Shared memory allocations must be 128B aligned for cpAsyncBulk
// operations to avoid CUDA_ERROR_MISALIGNED_ADDRESS.
auto aligned_address = alignExpr(
unaligned_address, (alloc_info->is_cp_async_bulk) ? 128 : 16);
// TODO: hoisting of addresses using for_loops_ recorded at first write
alloc->setAddress(aligned_address);
}
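The functional change in this file is the alignment rule: a shared-memory buffer whose producer is a cp.async.bulk (TMA) op has its start address rounded up to 128 bytes instead of the default 16. A minimal, self-contained sketch of that rounding, using a hypothetical alignUp helper rather than nvFuser's alignExpr:

#include <cassert>
#include <cstdint>

// Hypothetical helper: round a raw shared-memory offset up to the next
// multiple of `alignment`.
constexpr int64_t alignUp(int64_t unaligned_address, int64_t alignment) {
  return (unaligned_address + alignment - 1) / alignment * alignment;
}

int main() {
  // A previous allocation ends at byte offset 40. A regular smem buffer may
  // start at the next 16B boundary (48), but a cp.async.bulk (TMA) destination
  // must start at a 128B boundary to avoid CUDA_ERROR_MISALIGNED_ADDRESS.
  assert(alignUp(40, 16) == 48);
  assert(alignUp(40, 128) == 128);

  bool is_cp_async_bulk = true;  // mirrors AllocationInfo::is_cp_async_bulk
  assert(alignUp(40, is_cp_async_bulk ? 128 : 16) == 128);
  return 0;
}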
93 changes: 63 additions & 30 deletions csrc/device_lower/pass/allocation.cpp
@@ -483,19 +483,9 @@ class AllocationInserter : public kir::ExprMutator {
.build();
mbarrier->setMemoryType(MemoryType::Shared);

// The wait condition for mbarrier is a single thread and the expected
// number of transaction bytes
kir::MBarrierInit* mbarrier_init = IrBuilder::create<kir::MBarrierInit>(
mbarrier, expr->container()->oneVal(DataType::UInt32));

kir::Allocate* mbarrier_alloc =
IrBuilder::create<kir::Allocate>(mbarrier, MemoryType::Shared);

Scope* expr_scope = scope_.empty() ? nullptr : scope_.back();

kir::MBarrierInvalidate* mbarrier_inval =
IrBuilder::create<kir::MBarrierInvalidate>(mbarrier);

// For circular buffers we need to prepare a placeholder for the
// tokens created by 'MBarrierArriveExpectTx' IR node. The tokens are
// placed in shared memory and used by threads in a block.
@@ -510,38 +500,81 @@
kir::Allocate* mbarrier_tokens_alloc = IrBuilder::create<kir::Allocate>(
mbarrier_tokens, MemoryType::Shared);

NVF_ERROR(ir_utils::isCpAsyncBulkLoad(expr));
LoadStoreOp* ldst = expr->as<LoadStoreOp>();
TensorView* out_tv = ldst->out()->as<TensorView>();
ForLoop* circular_buffer_loop =
GpuLower::current()->circularBufferInfo().getCircularBufferLoop(
out_tv, for_loops_);

auto&& [pre_prologue_init, mbarrier_init] =
initializeMbarrier(circular_buffer_loop, ldst, mbarrier);

auto&& [post_epilogue_inval, mbarrier_inval] =
invalidateMbarrier(circular_buffer_loop, ldst, mbarrier);

// Block sync is necessary to finish mbarrier initialization.
kir::BlockSync* sync = IrBuilder::create<kir::BlockSync>(false);

// Add tokens, mbarriers, init, and inval operations around tma
// expression like this:
//
// __shared__ tokens[num_stages];
// __shared__ mbarrier[num_stages];
// for (circular_buffer_stage) {
// init(mbarrier[stage]);
// }
// block_sync();
//
// for (circular_buffer_loop) {
// __shared__ tokens[num_stages];
// __shared__ mbarrier[num_stages];
// init(mbarrier);
// cp.async.bulk(data, mbarrier);
// inval(mbarrier);
// }
//
// for (circular_buffer_stage) {
// inval(mbarrier[stage]);
// }
//

// NOTE: Block sync ir node is not added here. It will be added in the
// circular buffering pass
registerInsertBefore(expr, mbarrier_tokens_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_init, expr_scope);
registerInsertAfter(expr, mbarrier_inval, expr_scope);
// Find the scope containing the circular buffer for-loop. It is the
// scope one level higher than the circular buffer loop scope in scope_.
auto scope_iter = std::find(
scope_.begin(), scope_.end(), &circular_buffer_loop->body());
NVF_ERROR(scope_iter != scope_.end());
Scope* scope_containing_circular_buffer_loop =
(scope_iter == scope_.begin()) ? nullptr : *(scope_iter - 1);
registerInsertBefore(
circular_buffer_loop,
mbarrier_tokens_alloc,
scope_containing_circular_buffer_loop);
registerInsertBefore(
circular_buffer_loop,
mbarrier_alloc,
scope_containing_circular_buffer_loop);

registerInsertBefore(
circular_buffer_loop,
pre_prologue_init,
scope_containing_circular_buffer_loop);
registerInsertBefore(
circular_buffer_loop, sync, scope_containing_circular_buffer_loop);
registerInsertAfter(
circular_buffer_loop,
post_epilogue_inval,
scope_containing_circular_buffer_loop);

// Map LoadStoreOp expression to ir nodes created in this pass
GpuLower::current()->ldstMBarrierMap()[expr] = mbarrier;
GpuLower::current()->ldstMBarrierTokenMap()[expr] = mbarrier_tokens;
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[expr] = mbarrier_tokens;
// Register tokens placeholder for MBarrierInit and MBarrierInvalidate,
// needed to manage lifetime of smem buffer in alias memory
GpuLower::current()->ldstMBarrierTokenMap()[mbarrier_init] =
mbarrier_tokens;
GpuLower::current()->ldstMBarrierTokenMap()[mbarrier_inval] =
mbarrier_tokens;
// Keep track of kir::Allocate for mBarrier and token objects,
// to simplify circular buffering pass logic
GpuLower::current()->mBarrierTokenSmemAllocSet().insert(mbarrier_alloc);
GpuLower::current()->mBarrierTokenSmemAllocSet().insert(
mbarrier_tokens_alloc);
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[mbarrier_init] = mbarrier_tokens;
GpuLower::current()
->tmaCircularBufferInfo()
.ldst_mbarrier_token_map[mbarrier_inval] = mbarrier_tokens;
} else {
// create and allocate a memory barrier
TensorView* mbarrier = TensorViewBuilder()
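The comment block in this hunk describes the loop structure the allocation pass now produces: mbarrier and token buffers are allocated once, every stage's mbarrier is initialized in a pre-prologue loop, a block sync publishes the initialization, the main circular-buffer loop issues the TMA copies, and a post-epilogue loop invalidates the mbarriers. The sketch below only models that shape on the host with illustrative names; it uses no real mbarrier, sync, or TMA calls.

#include <cstdio>
#include <vector>

// Illustrative stand-in for an mbarrier slot; not a real CUDA barrier.
struct MBarrier {
  bool live = false;
};

int main() {
  constexpr int num_stages = 3;
  constexpr int num_iterations = 8;

  // __shared__ mbarrier[num_stages];  (plus the token buffer, omitted here)
  std::vector<MBarrier> mbarrier(num_stages);

  // Pre-prologue: initialize every stage's mbarrier once, before the loop.
  for (int stage = 0; stage < num_stages; ++stage) {
    mbarrier[stage].live = true;  // init(mbarrier[stage])
  }
  // A block sync would follow so all threads observe the initialization.

  // Main circular-buffer loop: iteration i reuses slot i % num_stages.
  for (int i = 0; i < num_iterations; ++i) {
    int stage = i % num_stages;
    std::printf("iteration %d -> mbarrier[%d]\n", i, stage);
  }

  // Post-epilogue: invalidate every stage's mbarrier after the loop.
  for (int stage = 0; stage < num_stages; ++stage) {
    mbarrier[stage].live = false;  // inval(mbarrier[stage])
  }
  return 0;
}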