diff --git a/csrc/device_lower/pass/circular_buffer.cpp b/csrc/device_lower/pass/circular_buffer.cpp index d547185e02b..22741558fd6 100644 --- a/csrc/device_lower/pass/circular_buffer.cpp +++ b/csrc/device_lower/pass/circular_buffer.cpp @@ -582,8 +582,8 @@ class TmaCircularBufferLoopCloner : public CircularBufferLoopCloner { LoadStoreOp* ldst = expr->as(); - // There should be a single mbarrier_arrive_tx_ for all ldst in current - // stage. + // There should be a single mbarrier_arrive_tx_ for each cpAsyncBulk load + // expression. NVF_ERROR(mbarrier_arrive_tx_ == nullptr); mbarrier_arrive_tx_ = createMbarrierArriveExpectTx(ldst, current_load_stage_);