
Double buffering for Hopper arch #1484

Closed · wants to merge 4 commits

Conversation

@drzejan2 (Collaborator) commented Dec 8, 2023

This change:

  • prepares a new set of double buffer test cases targeting the H100 arch
  • updates the implementation of the indexing computation for cpAsyncBulk operations

@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch 3 times, most recently from a4a6204 to c112933 on December 15, 2023 11:58
@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch from c112933 to b50babf on December 18, 2023 09:23
@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch from eafcf15 to 0cedfc3 on December 27, 2023 09:17
@drzejan2 (Collaborator, Author) commented Dec 27, 2023

I changed the approach to adding support for Hopper TMA double buffering / circular buffering - I decided to focus on how the final kernel should look.

Notes:

  • the kernels are based on the code currently generated by the double buffering pass, so any variable whose name starts with a is the result of manually adding a variable to the sources,
  • the bulk inner domain has a fixed size of 32 elements (see the test definition), which is why 32LL literals appear in multiple places in the kernels; the per-stage arithmetic behind these literals is sketched right after this list,
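
A minimal sketch of that per-stage arithmetic (a summary of the kernels below, not generated code), assuming the 32-element bulk inner domain from the test:

// Per-stage arithmetic behind the 32LL / 128 literals in the kernels below.
constexpr int kTileElems = 32;                          // bulk inner domain size (from the test)
constexpr int kTileBytes = kTileElems * sizeof(float);  // 128 bytes transferred per stage
constexpr int kMBarrierBytes = 8;                       // size of one mbarrier object

// For loop iteration i with STAGES buffers:
//   stage             = i % STAGES
//   smem data address = toSmem(T1) + kTileBytes * stage
//   mbarrier address  = toSmem(T3) + kMBarrierBytes * stage
//   global box coord  = kTileElems * i   (TMA coordinates are in elements, not bytes)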

The generated kernel is a result of running the following test case (link).

@naoyam can I ask you to check whether the prepared kernels make sense? I will start working on preparing the infrastructure changes to produce code like the kernels below.

@zasdfgbnm for visibility.


Double buffer kernel
__global__ void nvfuser_none_f0_c0_r0_g0(Tensor<float, 1, 1> T0, const __grid_constant__ TensorMap var0, Tensor<float, 1, 1> T2) {
  alignas(16) extern __shared__ char array[];
  const unsigned smem_offset = 0;
  NVFUSER_DEFINE_MAGIC_ZERO;
  nvfuser_index_t i1;
  i1 = ceilDiv(T0.logical_size[0LL], 32);
  const TensorMap* ptr2;
  ptr2 = &var0;

  // NOTE: the offset between mbarrier objects in an array of mbarrier objects
  // NOTE: barrier_t is not defined in this scope, can we include barrier.h?
  // NOTE: we shouldn't include any headers, so we will use hard-coded value, 8
  unsigned a10;
  a10 = 8;

  // NOTE: a placeholder for mbarrier objects, 2 buffers, each 8 bytes
  // NOTE: see alignment
  __shared__ __align__(64) uint64_t a11[16];

  // NOTE: the number of expected 'arrivals' of the mbarrier; for cpAsyncBulk we use
  //       a single thread to issue the instruction
  uint64_t a12;
  a12 = 1;

  // NOTE: placeholder for tokens
  __shared__ __align__(64) uint64_t a13[2];

  // NOTE: buffer for loaded gmem, 32 floats per stage, see T1 definition below
  // NOTE: size should be 32 items * sizeof(float) * number of stages
  __shared__ __align__(64) char array_2[32LL * 4LL * 2LL];
  float* T1 = reinterpret_cast<float*>(array_2); // reinterpret_cast<float*>(array + smem_offset + 0);

  // NOTE: start address of a placeholder for mbarrier objects
  uint64_t* T3 = reinterpret_cast<uint64_t*>(a11); // reinterpret_cast<uint64_t*>(array + smem_offset + (64LL * 2));

  // NOTE: init all mbarrier objects by the first thread in the block
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    for (unsigned a4 = 0; a4 < 2; ++a4) {
      mbarrier::init((toSmem(T3) + a10 * a4), a12);
    }
  }

  // NOTE: unrolled prolog loop
  // NOTE: can be merged with if expr with mbarrier init
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // NOTE: up to 32 x float -> 128 bytes
    a13[0] = mbarrier::arriveExpectTX((toSmem(T3) + a10 * 0), (uint32_t)(32LL * 4LL));
    Hopper::cpAsyncBulkTensorTileG2S((Hopper::CpAsyncBulkTensorTileG2SIndex<1>{ (&var0), (Array<nvfuser_index_t, 1, 1>{0 * 32}), (toSmem(T3) + a10 * 0)}), (toSmem(T1) + 128 * 0));
  }

  // NOTE: wait for all threads in block
  block_sync::sync<0>();

  // NOTE: main loop
  NVFUSER_UPDATE_MAGIC_ZERO;
  #pragma unroll 1
  for(nvfuser_index_t i6 = 0; i6 < (i1 - 1); ++i6) {
    // NOTE: i7/i8 are indices (base is 32), while cpAsyncBulk is in bytes (base is 128 -> 32 * sizeof(float))
    nvfuser_index_t i7;
    i7 = 32LL * (i6 % 2);
    nvfuser_index_t i8;
    i8 = 32LL * i6;

    // NOTE: index of the current barrier
    nvfuser_index_t a20;
    a20 = (i6 % 2);

    // NOTE: index of the next stage to fetch
    nvfuser_index_t a21;
    a21 = ((i6 + 1) % 2);

    // NOTE: schedule the next stage in the queue - a21
    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
      // NOTE: up to 32 x float -> 128 bytes
      a13[a21] = mbarrier::arriveExpectTX((toSmem(T3) + a10 * a21), (uint32_t)(32LL * 4LL));
      // NOTE: coordinates are in elements; with the bulk being 32 floats, we read the next box through the TMA desc
      Hopper::cpAsyncBulkTensorTileG2S((Hopper::CpAsyncBulkTensorTileG2SIndex<1>{ (&var0), (Array<nvfuser_index_t, 1, 1>{ (i6 + 1) * 32 }), (toSmem(T3) + a10 * a21) }), (toSmem(T1) + (128 * a21)));
    }

    // NOTE: wait for the current barrier - a20
    // NOTE: 1st arg: the mbarrier, 2nd arg: the expected token for this barrier
    // NOTE: only a single thread arrives (the 1st), the rest wait for the current phase to complete
    mbarrier::wait((toSmem(T3) + a10 * a20), a13[a20]);

    NVFUSER_UPDATE_MAGIC_ZERO;
    #pragma unroll
    for(nvfuser_index_t i10 = 0; i10 < 32; ++i10) {
      nvfuser_index_t i11;
      i11 = i8 + (i10 + nvfuser_zero);
      if ((i11 < T0.logical_size[0LL])) {
        T2[i11]
           = T1[(i7 + i10)];
      }
    }
    NVFUSER_UPDATE_MAGIC_ZERO;
  }

  // NOTE: epilogue loop
  // NOTE: we could avoid the epilogue loop by predicating the cpAsyncBulk fetch
  #pragma unroll 1
  for(nvfuser_index_t a30 = (i1 - 1); a30 < i1; ++a30) {
    nvfuser_index_t a31;
    a31 = 32LL * (a30 % 2);
    nvfuser_index_t a32;
    a32 = 32LL * a30;

    // NOTE: current barrier index
    nvfuser_index_t a33;
    a33 = (a30 % 2);

    // NOTE: wait for the current barrier to be processed - a33
    mbarrier::wait((toSmem(T3) + a10 * a33), a13[a33]);

    NVFUSER_UPDATE_MAGIC_ZERO;
    #pragma unroll
    for(nvfuser_index_t i10 = 0; i10 < 32; ++i10) {
      nvfuser_index_t i11;
      i11 = a32 + (i10 + nvfuser_zero);
      if ((i11 < T0.logical_size[0LL])) {
        T2[i11]
           = T1[(a31 + i10)];
      }
    }
    NVFUSER_UPDATE_MAGIC_ZERO;
  }

  // NOTE: inval memory with mbarrier objects
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    for (unsigned a4 = 0; a4 < 2; ++a4) {
      mbarrier::inval((toSmem(T3) + a10 * a4));
    }
  }
}

Circular buffer - there is a notion of <stages>, and the prolog/epilogue loops are not inlined.

Circular buffer kernel
__global__ void nvfuser_none_f0_c0_r0_g0(Tensor<float, 1, 1> T0, const __grid_constant__ TensorMap var0, Tensor<float, 1, 1> T2) {
  alignas(16) extern __shared__ char array[];
  const unsigned smem_offset = 0;
  NVFUSER_DEFINE_MAGIC_ZERO;
  nvfuser_index_t i1;
  i1 = ceilDiv(T0.logical_size[0LL], 32);
  const TensorMap* ptr2;
  ptr2 = &var0;

  // NOTE: number of stages
  constexpr unsigned STAGES = 3;

  // NOTE: the offset between mbarrier objects in an array of mbarrier objects
  // NOTE: barrier_t is not defined in this scope, can we include barrier.h?
  // NOTE: we shouldn't include any headers, so we will use hard-coded value, 8
  unsigned a10;
  a10 = 8;

  // NOTE: a placeholder for mbarrier objects, STAGES buffers, each 8 bytes
  // NOTE: see alignment
  __shared__ __align__(128) uint64_t a11[8LL * STAGES];

  // NOTE: the number of expected 'arrivals' of the mbarrier; for cpAsyncBulk we use
  //       a single thread to issue the instruction
  uint64_t a12;
  a12 = 1;

  // NOTE: placeholder for tokens
  __shared__ __align__(128) uint64_t a13[STAGES];

  // NOTE: buffer for loaded gmem, 32 floats per stage, see T1 definition below
  // NOTE: size should be 32 items * sizeof(float) * number of stages
  __shared__ __align__(128) char array_2[32LL * 4LL * STAGES];
  float* T1 = reinterpret_cast<float*>(array_2); // reinterpret_cast<float*>(array + smem_offset + 0);

  // NOTE: start address of a placeholder for mbarrier objects
  uint64_t* T3 = reinterpret_cast<uint64_t*>(a11); // reinterpret_cast<uint64_t*>(array + smem_offset + (64LL * 2));

  // NOTE: init all mbarrier objects by the first thread in the block
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    for (unsigned a4 = 0; a4 < STAGES; ++a4) {
      mbarrier::init((toSmem(T3) + a10 * a4), a12);
    }
  }

  // NOTE: unrolled prolog loop
  // NOTE: can be merged with if expr with mbarrier init
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // NOTE: up to 32 x float -> 128 bytes
    a13[0] = mbarrier::arriveExpectTX((toSmem(T3) + a10 * 0), (uint32_t)(32LL * 4LL));
    Hopper::cpAsyncBulkTensorTileG2S((Hopper::CpAsyncBulkTensorTileG2SIndex<1>{ ptr2, (Array<nvfuser_index_t, 1, 1>{0 * 32}), (toSmem(T3) + a10 * 0)}), (toSmem(T1) + 128 * 0));
  }

  // NOTE: wait for all threads in block
  block_sync::sync<0>();

  // NOTE: main loop
  NVFUSER_UPDATE_MAGIC_ZERO;
  #pragma unroll 1
  for(nvfuser_index_t i6 = 0; i6 < (i1 - 1); ++i6) {
    // NOTE: i7/i8 are indices (base is 32), while cpAsyncBulk is in bytes (base is 128 -> 32 * sizeof(float))
    nvfuser_index_t i7;
    i7 = 32LL * (i6 % STAGES);
    nvfuser_index_t i8;
    i8 = 32LL * i6;

    // NOTE: index of the current barrier
    nvfuser_index_t a20;
    a20 = (i6 % STAGES);

    // NOTE: index of the next stage to fetch
    nvfuser_index_t a21;
    a21 = ((i6 + 1) % STAGES);

    // NOTE: schedule the next stage in the queue - a21
    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
      // NOTE: up to 32 x float -> 128 bytes
      a13[a21] = mbarrier::arriveExpectTX((toSmem(T3) + a10 * a21), (uint32_t)(32LL * 4LL));
      // NOTE: coordinates are in elements; with the bulk being 32 floats, we read the next box through the TMA desc
      Hopper::cpAsyncBulkTensorTileG2S((Hopper::CpAsyncBulkTensorTileG2SIndex<1>{ ptr2, (Array<nvfuser_index_t, 1, 1>{ (i6 + 1) * 32 }), (toSmem(T3) + a10 * a21) }), (toSmem(T1) + (128 * a21)));
    }

    // NOTE: wait for the current barrier - a20
    // NOTE: 1st arg: the mbarrier, 2nd arg: the expected token for this barrier
    // NOTE: only a single thread arrives (the 1st), the rest wait for the current phase to complete
    mbarrier::wait((toSmem(T3) + a10 * a20), a13[a20]);

    NVFUSER_UPDATE_MAGIC_ZERO;
    #pragma unroll
    for(nvfuser_index_t i10 = 0; i10 < 32; ++i10) {
      nvfuser_index_t i11;
      i11 = i8 + (i10 + nvfuser_zero);
      if ((i11 < T0.logical_size[0LL])) {
        T2[i11]
           = T1[(i7 + i10)];
      }
    }
    NVFUSER_UPDATE_MAGIC_ZERO;
  }

  // NOTE: epilogue loop
  // NOTE: we could avoid the epilogue loop by predicating the cpAsyncBulk fetch
  #pragma unroll 1
  for(nvfuser_index_t a30 = (i1 - STAGES + 1); a30 < i1; ++a30) {
    nvfuser_index_t a31;
    a31 = 32LL * (a30 % STAGES);
    nvfuser_index_t a32;
    a32 = 32LL * a30;

    // NOTE: current barrier index
    nvfuser_index_t a33;
    a33 = (a30 % STAGES);

    // NOTE: wait for the current barrier to be processed - a33
    mbarrier::wait((toSmem(T3) + a10 * a33), a13[a33]);

    NVFUSER_UPDATE_MAGIC_ZERO;
    #pragma unroll
    for(nvfuser_index_t i10 = 0; i10 < 32; ++i10) {
      nvfuser_index_t i11;
      i11 = a32 + (i10 + nvfuser_zero);
      if ((i11 < T0.logical_size[0LL])) {
        T2[i11]
           = T1[(a31 + i10)];
      }
    }
    NVFUSER_UPDATE_MAGIC_ZERO;
  }

  // NOTE: inval memory with mbarrier objects
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    for (unsigned a4 = 0; a4 < STAGES; ++a4) {
      mbarrier::inval((toSmem(T3) + a10 * a4));
    }
  }
}

@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch from 77df1a8 to 73706fe on January 9, 2024 15:51
@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch 2 times, most recently from 15a17a6 to 3740e5d on January 24, 2024 15:27
@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch from 3740e5d to 69c6c44 on February 1, 2024 08:28
@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch 6 times, most recently from f1ed7d2 to 992215a on February 20, 2024 09:22
@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch 4 times, most recently from b0fd2be to c94352a on February 22, 2024 09:50
@drzejan2 (Collaborator, Author) commented Feb 22, 2024

The state of the implementation, as of 02/22/2024:

  • add handling of additional smem allocations needed to store tokens (a token is an object used to synchronize memory operations that use mbarrier objects)
  • update smem allocations for mbarrier and token objects; double/circular buffering requires one instance of each per stage, per double-buffered TensorView
  • update the mechanism for calculating smem allocations' lifetimes (e.g. an mbarrier's lifetime starts before its initialization and ends after its invalidation)
  • modify indexing - we shift focus from the allocation/index pass to the double buffering pass w.r.t. cpAsyncBulk memory operations; there are still cases where a memory operation is cpAsyncBulk but not part of double buffering - for these we keep the naïve synchronization mechanism, while not breaking what was done in the Double Buffering pass (LoadStoreOp replacements / new loop introduction)
  • modify the Double Buffering pass in lowering
    • separate execution paths, based on the LoadStoreOps in the Double Buffer loop:
      • if there is no LoadStoreOp that is cpAsyncBulk, use the original execution path (with some changes, see insert)
      • if there is at least one LoadStoreOp that is cpAsyncBulk, use insertTMA (name to be changed), which redefines the structure of the resulting kir
    • the new execution path introduces two new concepts, apart from the existing prolog/main/epilogue loops (more details can be found at the end of this long comment):
      • 'pre-prologue' - a collection of expressions that contains the smem allocations (so they are visible throughout the double-buffered loop) and the mbarrier initialization, which is predicated so it is done only by the first thread in a block,
      • 'post-epilogue' - a collection of expressions that handle mbarrier object invalidation,
    • to handle cpAsyncBulk / mbarrier::wait in the main loop we introduce new indices (Val*, current and next) that are based on the double buffering loop
    • throughout the Double Buffering pass we actively use and update data that was created and stored during the allocation and memory aliasing passes (e.g. LoadStoreOp -> smem TV / kir::Allocate collection / LoadStoreOp -> loop index); it is later used by the indexing pass

@naoyam, @zasdfgbnm - can I ask you to check whether the direction I'm heading is aligned with Fuser's current and future architecture? Thanks!


NOTE: for the time being, please ignore changes in index_compute.cpp - these need to be redone and refined.


An overview of how the resulting cloned loops should look:

// pre-prolog:
// - smem allocations (mbarriers, tokens)
// - mbarrier init (0..stages)
//
// prolog loop:
// - 0th thread:
//   - issue 0..stages-1 cp async bulks
//
// main loop:
// - 0th thread:
//   - issue next cp async bulk
// - copy body, without
//   - smem allocations
//   - mbarrier inits
//   - mbarrier inval
//
// epilogue loop:
// - copy body, without
//   - smem allocations
//   - issuing cp async
//   - mbarrier inits
//   - mbarrier inval
//
// post-epilogue:
//  - 0th thread: loop with mbarriers inval

The scope of kir changes in the prolog loop:

// Replace cpAsyncBulk type LoadStoreOp with:
// if (0th thread in block)
//    token[loop_idx] = mbarrier::arriveExpectTx(mbarrier[loop_idx])
//    cpAsyncBulk(mbarrier[loop_idx],...)
//
//  Where loop_idx is in range 0...stages-1

The scope of kir changes in the main loop:

// Replace LoadStoreOp with:
//  if (0th thread in block)
//    token[next_stage] = mbarrier::arriveExpectTx(mbarrier[next_stage])
//    cpAsyncBulk(mbarrier[next_stage],...)
//  mbarrier::wait(token[curr_stage])
//
// Where mbarrier and token are smem arrays bound to the LoadStoreOp
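
For completeness, a sketch of the corresponding kir changes in the epilogue loop, inferred from the kernels above (the original comment does not spell this case out):

// In the epilogue loop the cpAsyncBulk issue is dropped entirely; only the
// wait on an already-scheduled stage remains before the copy body:
//  mbarrier::wait(token[curr_stage])
//
// Where curr_stage indexes a stage that was issued by the prolog or main loop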

@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch from c94352a to 6a0a656 on February 22, 2024 11:58
@drzejan2 drzejan2 requested review from zasdfgbnm and naoyam February 22, 2024 11:58
@naoyam (Collaborator) commented Feb 22, 2024

(quoting the implementation status comment and question above)

It looks reasonable to me. At least, it seems like a natural extension of the current double buffering pass.

Comment on lines +883 to +887
// Register life time start for a smem placeholder with tokens
// returned by MBarrierArriveExpectTx / MBarrierArrive
if (GpuLower::current()->ldstMBarrierTokenMap().count(expr)) {
  markWrite(GpuLower::current()->ldstMBarrierTokenMap()[expr]);
}
Reviewer (Collaborator):
Why do we need to handle the token's liveness specially? Will it just work if we use the token's first write and last read?

Reply (Collaborator):

The mbarrier and its token are mapped together. The token is the mbarrier state of the last phase.

Token liveness doesn't seem a strong priority. It is only a single int64_t.
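
For context, a minimal single-stage sketch of how an mbarrier and its token pair up, using only the runtime helpers that already appear in the kernels above (buffer names are illustrative, and the TMA call itself is elided):

__shared__ __align__(8) uint64_t bar[1]; // one mbarrier object (8 bytes)
__shared__ uint64_t tok[1];              // state/token returned by arriveExpectTX

if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
  mbarrier::init(toSmem(bar), /*expected arrival count=*/1);
  // announce the expected transfer size (32 floats -> 128 bytes), then issue the copy
  tok[0] = mbarrier::arriveExpectTX(toSmem(bar), (uint32_t)(32LL * 4LL));
  // Hopper::cpAsyncBulkTensorTileG2S(<TMA desc, box coords, toSmem(bar)>, <smem dst>);
}
// make bar/tok visible to every thread before anyone waits on them
block_sync::sync<0>();
// all threads block until the transfer tracked by tok[0] has completed
mbarrier::wait(toSmem(bar), tok[0]);
// once the barrier is no longer needed:
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
  mbarrier::inval(toSmem(bar));
}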

Comment on lines 523 to 534
// Find the largest circular buffer depth, to be used for bulk load
// allocation
if (out_tv->isDoubleBuffered()) {
  if (out_tv->isCircularBuffered()) {
    circular_buffer_depth =
        std::max(circular_buffer_depth, out_tv->circularBufferDepth());
  } else {
    circular_buffer_depth = std::max(circular_buffer_depth, 2u);
  }
}

Reviewer (Collaborator):

Is it supported to have multiple outputs with different circular buffer depths? If not, should we add an NVF_ERROR check for that?

Reply (drzejan2, Collaborator Author):

Now that you have pointed this out, it really makes no sense to pick the largest stage count. In fact, each circular buffer should be considered separately (with its own set of mbarriers/tokens). I will update this part of the code.

Thanks!

Comment on lines +603 to +612
registerInsertBefore(expr, mbarrier_tokens_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_alloc, expr_scope);
registerInsertBefore(expr, mbarrier_init, expr_scope);
registerInsertAfter(expr, mbarrier_inval, expr_scope);
Reviewer (Collaborator):

So we are doing the init and inval inside the cp.async loop, i.e. generating code like:

for i in range(I0):
    __shared__ tokens;
    __shared__ mbarrier;
    init(mbarrier);
    cp.async.bulk(data, mbarrier);
    inval(mbarrier);

and leave it for the double buffering pass to fix? Could you leave a comment in the code about this?

Reply (drzejan2, Collaborator Author):

Yes, that's the current implementation - I will update the code here so it is clear that the Double Buffering pass modifies this part of the code. Having it done here has the benefit of easier control of shared memory allocations / storage liveness (the order of passes is insertAllocations -> reuseMemoryAllocations -> DoubleBufferPass).

Switching to an approach where all the needed elements are added in the double buffering pass (we detect a LoadStoreOp for a bulk copy op -> we add placeholders for tokens and mbarriers, the mbarrier init, and later the mbarrier inval) would have major consequences: adding smem allocations (mbarrier / tokens) and lifetime analysis to the system out of order (pass-wise) might be a challenge.

I will think about how we could do this in a clean way.

Thank you for the comment!
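
For reference, a rough sketch (in the comment style of the overview earlier in this thread) of the hoisting the Double Buffering pass is expected to perform on the snippet above; conceptual only, not the pass's actual output:

// pre-prolog (hoisted out of the loop):
//   __shared__ tokens;  __shared__ mbarrier;     // smem allocations
//   if (first thread in block) init(mbarrier);
//
// main loop keeps only the copy:
//   for i in range(I0):
//     cp.async.bulk(data, mbarrier);
//
// post-epilogue (hoisted out of the loop):
//   if (first thread in block) inval(mbarrier);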

// mbarrier::init(...);
// }
// }
class CpAsyncBulkPrePrologue : public kir::IrVisitor {
Reviewer (Collaborator):

Should we add a DoubleBufferLoopStage::PrePrologue and a DoubleBufferLoopStage::PostEpilogue and move the code below to DoubleBufferLoopCloner?

Reply (drzejan2, Collaborator Author):

That's a good point.

For now I have these as separate functional classes - the main reason is that DoubleBufferLoopCloner returns a kir::ForLoop while pre-prologue/post-epilogue are a simple collection (a list) of expressions. But you are right, we could do this in a smarter way where these two new 'regions' are handled alongside prolog/main/epilogue. I will try to unify this part of the code, or at least leave a comment that the architecture of this part of the system should be reconsidered.

Thanks!
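
A minimal sketch of the suggested enum extension (illustrative only - the existing enumerator names here are assumptions based on this discussion, not copied from the nvFuser sources):

enum class DoubleBufferLoopStage {
  NotApplicable,
  Prolog,
  Main,
  Epilog,
  PrePrologue,   // proposed: smem allocations + predicated mbarrier init
  PostEpilogue,  // proposed: predicated mbarrier inval
};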

zasdfgbnm added a commit that referenced this pull request Mar 13, 2024
In our current main branch, all predicates of `cp.async.bulk` are
skipped. It is skipped not because it should be like that, but instead,
it is just a quick simple hack to allow us to incrementally build out
TMA. Currently, TMA can only be used in a `<<<1, 1>>>` kernel, and it
can only be used to copy the entire tensor, instead of copying a part of
that tensor. Under this limitation, it totally makes sense to skip the
predicates.

However, it no longer makes sense to skip predicate generation for TMA
as we are adding support for non-trivial cases. For example, in
#1484, an `if (threadIdx.x == 0 &&
threadIdx.y == 0 && threadIdx.z == 0)` is manually created in the double
buffering pass as a temporary solution. Also, I just started working on
allowing TMA to be used in a non-`<<<1, 1>>>` kernel, where a thread
predicate is clearly needed.

In this PR, I am re-enabling predicate generation for TMA. For all the
code that is already in main branch, this PR should be a no-op. I do not
expect any change in the generated code for any TMA test. However,
#1484 will be impacted in the sense
that the `if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)`
should no longer be created manually in the double-buffering pass, but
instead, the double-buffering pass should leave the TMA op as-is, and
the predicate generation pass will handle it.
@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch from 6a0a656 to 3d26c04 on March 19, 2024 14:47
@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch from 3d26c04 to f8cb081 on March 25, 2024 17:31
@rdspring1 (Collaborator):

@drzejan2 I was reviewing this PR to see how it could be adapted for pointwise and normalization kernels. Is the PR in a usable state? What functionality is not yet implemented?

@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch from f8cb081 to bbdc753 on May 24, 2024 12:05
@drzejan2 drzejan2 force-pushed the ab/Hopper_double_buffering branch from bbdc753 to 6fb1869 on May 24, 2024 12:09
@rdspring1 rdspring1 self-assigned this Jun 6, 2024
@rdspring1 (Collaborator):

Closed in favor of #2833

@rdspring1 rdspring1 closed this Oct 16, 2024
@zasdfgbnm zasdfgbnm deleted the ab/Hopper_double_buffering branch October 16, 2024 20:08