Indexing changes for TMA Circular Buffering #2825

Merged · 4 commits · Sep 5, 2024
Changes from 2 commits
39 changes: 39 additions & 0 deletions csrc/device_lower/lower2device.h
@@ -226,6 +226,32 @@ class GpuLower : public NonCopyable {
return ldst_mbarrier_map_;
}

std::unordered_map<const Expr*, TensorView*>& ldstMBarrierTokenMap() {
return ldst_mbarrier_token_map_;
}

const std::unordered_map<const Expr*, TensorView*>& ldstMBarrierTokenMap()
const {
return ldst_mbarrier_token_map_;
}

std::unordered_set<const Expr*>& mBarrierTokenSmemAllocSet() {
return mbarrier_token_smem_alloc_set_;
}

const std::unordered_set<const Expr*>& mBarrierTokenSmemAllocSet() const {
return mbarrier_token_smem_alloc_set_;
}

std::unordered_map<const Expr*, kir::TensorIndex*>& ldstMBarrierIndexMap() {
return ldst_mbarrier_index_map_;
}

const std::unordered_map<const Expr*, kir::TensorIndex*>&
ldstMBarrierIndexMap() const {
return ldst_mbarrier_index_map_;
}

bool isNvFuserZeroEnabled() {
if (isOptionDisabled(DisableOption::MagicZero)) {
return false;
@@ -359,6 +385,19 @@ class GpuLower : public NonCopyable {
//! for vectorization.
std::vector<std::pair<const Val*, std::string>> validations_;

// Keep track of placeholders for the tokens returned by arrive/expect-tx
// mbarrier operations, for each load/store operation that requires such
// synchronization
std::unordered_map<const Expr*, TensorView*> ldst_mbarrier_token_map_;

// Collection of kir::Allocate expressions for the smem buffers that hold the
// mbarrier and token objects used for cpAsyncBulk synchronization
std::unordered_set<const Expr*> mbarrier_token_smem_alloc_set_;

// Keep track of which mbarrier object is used by each load/store operation
// that requires such synchronization; needed by the indexing pass
std::unordered_map<const Expr*, kir::TensorIndex*> ldst_mbarrier_index_map_;

Fusion* fusion_ = nullptr;

// A temporary flag which is true if the fusion uses any feature that requires
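The three new accessors expose per-expression bookkeeping to later lowering passes. As a rough, hedged illustration of how a downstream pass might consult them (the helper name below is hypothetical and not part of this PR; only the GpuLower accessors shown in this diff are assumed):

// Hypothetical helper, for illustration only: look up the mbarrier, the token
// placeholder, and the per-stage index recorded for a cp.async.bulk load.
void inspectMBarrierInfo(Expr* ldst) {
  GpuLower* lower = GpuLower::current();

  auto mbarrier_it = lower->ldstMBarrierMap().find(ldst);
  if (mbarrier_it == lower->ldstMBarrierMap().end()) {
    // Not a load/store that was given an mbarrier in the allocation pass.
    return;
  }
  TensorView* mbarrier = mbarrier_it->second;

  // Shared-memory placeholder for the tokens returned by arrive/expect-tx.
  auto token_it = lower->ldstMBarrierTokenMap().find(ldst);
  TensorView* tokens =
      token_it != lower->ldstMBarrierTokenMap().end() ? token_it->second
                                                      : nullptr;

  // Element of the mbarrier array chosen by the indexing pass, once populated.
  auto index_it = lower->ldstMBarrierIndexMap().find(ldst);
  kir::TensorIndex* mbarrier_index =
      index_it != lower->ldstMBarrierIndexMap().end() ? index_it->second
                                                      : nullptr;

  (void)mbarrier;
  (void)tokens;
  (void)mbarrier_index;
}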
49 changes: 38 additions & 11 deletions csrc/device_lower/pass/alias_memory.cpp
@@ -866,20 +866,47 @@ class AllocationInfoMap : private kir::IrVisitor {
}

void collectLivenessInfoOfExprMBarrier(Expr* expr) {
const auto expr_pos = scope_map_.getExprPos(expr);
int64_t expr_pos = scope_map_.getExprPos(expr);

auto mark_liveness = [&expr_pos, this](TensorView* tv, bool is_write) {
AllocationInfo* alloc_info = getAllocInfoFromTV(tv);
if (is_write) {
alloc_info->inner_live_interval->markWrite(expr_pos);
} else {
alloc_info->inner_live_interval->markRead(expr_pos);
}
ScopeInfo* outer_loop_info = ascendLoopNestToSameLevelAs(alloc_info);
int64_t outer_pos =
outer_loop_info ? outer_loop_info->start_pos : expr_pos;
if (is_write) {
alloc_info->outer_live_interval->markWrite(outer_pos);
} else {
alloc_info->outer_live_interval->markRead(outer_pos);
}
};

// The mbarrier and its token share the same liveness; the token is the
// mbarrier state of the last phase.
if (auto init = dynamic_cast<kir::MBarrierInit*>(expr)) {
auto alloc_info = getAllocInfoFromTV(init->mbarrier()->as<TensorView>());
alloc_info->inner_live_interval->markWrite(expr_pos);
auto outer_loop_info = ascendLoopNestToSameLevelAs(alloc_info);
auto write_pos = outer_loop_info ? outer_loop_info->start_pos : expr_pos;
alloc_info->outer_live_interval->markWrite(write_pos);
mark_liveness(init->mbarrier()->as<TensorView>(), /*is_write=*/true);

// Register the start of lifetime for an mbarrier token returned by
// MBarrierArriveExpectTx and MBarrierArrive.
if (GpuLower::current()->ldstMBarrierTokenMap().count(expr) > 0) {
mark_liveness(
GpuLower::current()->ldstMBarrierTokenMap()[expr],
/*is_write=*/true);
}
} else if (auto inval = dynamic_cast<kir::MBarrierInvalidate*>(expr)) {
auto alloc_info = getAllocInfoFromTV(inval->mbarrier()->as<TensorView>());
alloc_info->inner_live_interval->markRead(expr_pos);
auto outer_loop_info = ascendLoopNestToSameLevelAs(alloc_info);
auto write_pos = outer_loop_info ? outer_loop_info->start_pos : expr_pos;
alloc_info->outer_live_interval->markRead(write_pos);
mark_liveness(inval->mbarrier()->as<TensorView>(), /*is_write=*/false);

// Register the end of lifetime for an mbarrier token returned by
// MBarrierArriveExpectTx and MBarrierArrive.
if (GpuLower::current()->ldstMBarrierTokenMap().count(expr) > 0) {
mark_liveness(
GpuLower::current()->ldstMBarrierTokenMap()[expr],
/*is_write=*/false);
}
}
}

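Routing both the mbarrier TensorView and its token placeholder through the same mark_liveness lambda gives the two shared-memory buffers identical inner and outer live intervals, so the alias-memory pass will not reclaim either buffer while the other is still in use. Below is a minimal sketch of the interval bookkeeping that markWrite()/markRead() imply; it is an illustration under assumed semantics, not the actual AllocationInfo classes:

#include <algorithm>
#include <cstdint>
#include <limits>

// Toy live interval: the first write opens it, the last read closes it.
// Two shared-memory buffers may alias only if their intervals do not overlap.
struct ToyLiveInterval {
  int64_t first_write = std::numeric_limits<int64_t>::max();
  int64_t last_read = std::numeric_limits<int64_t>::min();

  void markWrite(int64_t pos) { first_write = std::min(first_write, pos); }
  void markRead(int64_t pos) { last_read = std::max(last_read, pos); }

  bool overlapsWith(const ToyLiveInterval& other) const {
    return first_write <= other.last_read && other.first_write <= last_read;
  }
};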
129 changes: 102 additions & 27 deletions csrc/device_lower/pass/allocation.cpp
@@ -274,6 +274,8 @@ class AllocationInserter : public kir::ExprMutator {
return;
}

int64_t circular_buffer_depth = 1;

// Found where the allocation needs to be inserted

for (const auto i : c10::irange(expr->outputs().size())) {
@@ -345,6 +347,16 @@
auto alloc_expr = createAllocExpr(allocation, is_output);
auto init_expr = createInitExpr(allocation, init);

// Find the largest circular buffer depth; used for TMA buffer allocation
if (out_tv->isCircularBuffered() && circular_buffer_depth == 1) {
circular_buffer_depth = out_tv->circularBufferDepth();
}
NVF_ERROR(
circular_buffer_depth == 1 ||
circular_buffer_depth == out_tv->circularBufferDepth(),
"Expected all output TensorViews for the same expression ",
"to have the same circular_buffer_depth");

// Write information to GPULower
writeInfoToGPULower(allocation, alloc_expr);

@@ -378,33 +390,96 @@
// solution, we should remove this after we have a better way to handle
// synchronizations for cp.async.bulk.
if (ir_utils::isCpAsyncBulkLoad(expr)) {
// create and allocate a memory barrier
TensorView* mbarrier = TensorViewBuilder()
.shape(std::vector<int64_t>{})
.dtype(DataType::UInt)
.contiguity(true)
.build();
mbarrier->setMemoryType(MemoryType::Shared);
auto mbarrier_init = IrBuilder::create<kir::MBarrierInit>(
mbarrier,
simplifyExpr(SimplifyingIrBuilder::maybeCastExpr(
DataType::UInt32,
lower_utils::getNumThreadsInTensorView(
expr->output(0)->as<TensorView>()))));
auto sync_init = IrBuilder::create<kir::BlockSync>();
auto mbarrier_inval =
IrBuilder::create<kir::MBarrierInvalidate>(mbarrier);
auto sync_inval = IrBuilder::create<kir::BlockSync>();

kir::Allocate* mbarrier_alloc =
IrBuilder::create<kir::Allocate>(mbarrier, MemoryType::Shared);
Scope* expr_scope = scope_.empty() ? nullptr : scope_.back();
registerInsertBefore(expr, mbarrier_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_init, expr_scope);
registerInsertBefore(expr, sync_init, expr_scope);
registerInsertAfter(expr, mbarrier_inval, expr_scope);
registerInsertAfter(expr, sync_inval, expr_scope);
GpuLower::current()->ldstMBarrierMap()[expr] = mbarrier;
if (circular_buffer_depth > 1) {
// Create and allocate a memory barrier. If this is a circular buffer,
// then allocate an array of mbarrier objects. mbarrier::init and
// mbarrier::inval will be updated in the circular buffering pass, but we
// add them here so shared memory is handled correctly in the alias memory
// pass.
TensorView* mbarrier =
TensorViewBuilder()
.shape(std::vector<int64_t>{circular_buffer_depth})
.dtype(DataType::UInt)
.contiguity(true)
.build();
mbarrier->setMemoryType(MemoryType::Shared);

// The wait condition for the mbarrier is a single thread arrival plus the
// expected number of transaction bytes
kir::MBarrierInit* mbarrier_init = IrBuilder::create<kir::MBarrierInit>(
mbarrier, expr->container()->oneVal(DataType::UInt32));

kir::Allocate* mbarrier_alloc =
IrBuilder::create<kir::Allocate>(mbarrier, MemoryType::Shared);

Scope* expr_scope = scope_.empty() ? nullptr : scope_.back();

kir::MBarrierInvalidate* mbarrier_inval =
IrBuilder::create<kir::MBarrierInvalidate>(mbarrier);

// For circular buffers we need to prepare a placeholder for the
// tokens created by the 'MBarrierArriveExpectTx' IR node. The tokens are
// placed in shared memory and used by the threads in a block.
TensorView* mbarrier_tokens =
TensorViewBuilder()
.shape(std::vector<int64_t>{circular_buffer_depth})
.dtype(DataType::UInt)
.contiguity(true)
.build();
mbarrier_tokens->setMemoryType(MemoryType::Shared);

kir::Allocate* mbarrier_tokens_alloc = IrBuilder::create<kir::Allocate>(
mbarrier_tokens, MemoryType::Shared);

// NOTE: The block sync IR node is not added here; it will be added in the
// circular buffering pass.
registerInsertBefore(expr, mbarrier_tokens_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_init, expr_scope);
registerInsertAfter(expr, mbarrier_inval, expr_scope);

// Map LoadStoreOp expression to ir nodes created in this pass
GpuLower::current()->ldstMBarrierMap()[expr] = mbarrier;
GpuLower::current()->ldstMBarrierTokenMap()[expr] = mbarrier_tokens;
// Register the token placeholder for MBarrierInit and MBarrierInvalidate;
// needed to manage the lifetime of the smem buffer in the alias memory pass
GpuLower::current()->ldstMBarrierTokenMap()[mbarrier_init] =
mbarrier_tokens;
GpuLower::current()->ldstMBarrierTokenMap()[mbarrier_inval] =
mbarrier_tokens;
// Keep track of the kir::Allocate nodes for the mbarrier and token objects
// to simplify the circular buffering pass logic
GpuLower::current()->mBarrierTokenSmemAllocSet().insert(mbarrier_alloc);
GpuLower::current()->mBarrierTokenSmemAllocSet().insert(
mbarrier_tokens_alloc);
} else {
// create and allocate a memory barrier
TensorView* mbarrier = TensorViewBuilder()
.shape(std::vector<int64_t>{})
.dtype(DataType::UInt)
.contiguity(true)
.build();
mbarrier->setMemoryType(MemoryType::Shared);
auto mbarrier_init = IrBuilder::create<kir::MBarrierInit>(
mbarrier,
simplifyExpr(SimplifyingIrBuilder::maybeCastExpr(
DataType::UInt32,
lower_utils::getNumThreadsInTensorView(
expr->output(0)->as<TensorView>()))));
auto sync_init = IrBuilder::create<kir::BlockSync>();
auto mbarrier_inval =
IrBuilder::create<kir::MBarrierInvalidate>(mbarrier);
auto sync_inval = IrBuilder::create<kir::BlockSync>();

kir::Allocate* mbarrier_alloc =
IrBuilder::create<kir::Allocate>(mbarrier, MemoryType::Shared);
Scope* expr_scope = scope_.empty() ? nullptr : scope_.back();
registerInsertBefore(expr, mbarrier_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_init, expr_scope);
registerInsertBefore(expr, sync_init, expr_scope);
registerInsertAfter(expr, mbarrier_inval, expr_scope);
registerInsertAfter(expr, sync_inval, expr_scope);
GpuLower::current()->ldstMBarrierMap()[expr] = mbarrier;
}
}
}

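To make the depth-sized mbarrier and token arrays concrete: each circular-buffer stage (iteration % depth) owns one mbarrier element and one token slot, where a token is essentially the barrier phase captured at arrive time that a later wait must outlast. The following self-contained toy model is only meant to illustrate that relationship; it is an assumption for intuition, not nvFuser or CUDA mbarrier code:

#include <cstdint>
#include <vector>

// Toy barrier: a token is the phase recorded at arrive time; waiting succeeds
// once the barrier has advanced past that phase (e.g., all bytes arrived).
struct ToyBarrier {
  uint64_t phase = 0;
  uint64_t arriveToken() const { return phase; }
  void complete() { ++phase; }
  bool isDone(uint64_t token) const { return phase > token; }
};

int main() {
  const int64_t depth = 3;
  // One barrier and one remembered token per circular-buffer stage, mirroring
  // the two depth-sized shared-memory arrays allocated in this pass.
  std::vector<ToyBarrier> mbarrier(depth);
  std::vector<uint64_t> token(depth, 0);

  for (int64_t i = 0; i < 9; ++i) {
    const int64_t stage = i % depth; // ring-buffer slot reused every `depth` iterations
    token[stage] = mbarrier[stage].arriveToken(); // arrive: remember the phase
    mbarrier[stage].complete();                   // copy engine finishes the transfer
    if (!mbarrier[stage].isDone(token[stage])) {  // consumer waits on the token
      return 1;
    }
  }
  return 0;
}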