From e96a63a3818ea9abba4e4b1f7d61697f25467eb6 Mon Sep 17 00:00:00 2001
From: Liqiang Lu <116412316+liqiangxl@users.noreply.github.com>
Date: Wed, 20 Nov 2024 11:21:38 -0500
Subject: [PATCH] use bdimy = 1 to WAR smem race (#3423)

When total_reduction_numel <= 1024, the scheduler may use multiple
reductions per block with bdimy > 1; this leads to a race condition in
shared memory when using async copy. Adding `cp.async.wait_all` after
the 1st async copy can avoid the race, but we need to figure out the
root cause before we can safely use it. So, here we set bdimy = 1 as a
WAR. Should be reverted after
https://github.com/NVIDIA/Fuser/pull/3438 is merged.

Race detected with:
```
NVFUSER_DUMP=scheduler_params,cuda_to_file NVFUSER_ENABLE=kernel_debug PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer --tool racecheck --racecheck-detect-level info ./nvfuser_tests --gtest_filter='CombinedSchedulerTest.LayerNormBackward/dtype_double_batch_216_hidden_96'
```
---
 csrc/scheduler/normalization_inner_outer.cpp | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/csrc/scheduler/normalization_inner_outer.cpp b/csrc/scheduler/normalization_inner_outer.cpp
index 8f02d0bdf1d..b32f8873ed1 100644
--- a/csrc/scheduler/normalization_inner_outer.cpp
+++ b/csrc/scheduler/normalization_inner_outer.cpp
@@ -657,24 +657,18 @@ std::unique_ptr<ReductionParams> innerOuterPersistentHeuristic(
   if (inner_dim_numel <= 1024) {
     rparams->multiple_reds_per_blk = true;
     rparams->tidx_for_outer_reduction = true;
-    constexpr int64_t threads_per_block_mrpb = 512;
     // Step-1, InnerParams, Reduction dim: inner_vect(reuse),
     // inner_batch(reuse), bdimx
     iop.bdimx = ceilDiv(inner_dim_numel, iop.inner_vect * iop.inner_batch);
     // Step-2, InnerParams, Iteration dim: gdimy, bdimy (in next step)
-    iop.gdimy =
-        getGdimy(iop.inner_vect, threads_per_block_mrpb, iop.inner_batch);
+    iop.gdimy = getGdimy(iop.inner_vect, iop.bdimx, iop.inner_batch);
     // Step-3, OuterParams, Iteration dim: vectorization_factor_outer(reuse),
-    // bdimy, gdimy (in previous step). We prefer bdimy to be larger enough to
-    // cover what is left in both the outer_dim and inner_dim. However, it
-    // should not exceed the limitation set by threads_per_block_mrpb.
-    int64_t bdimy_tmp = std::max(
-        ceilDiv(outer_dim_numel, iop.gdimy),
-        ceilDiv(inner_dim_numel, iop.vectorization_factor_outer * iop.gdimy));
-    iop.bdimy = std::min(threads_per_block_mrpb / iop.bdimx, bdimy_tmp);
+    // bdimy, gdimy (in previous step).
+    // WAR for https://github.com/NVIDIA/Fuser/issues/3428
+    iop.bdimy = 1;
     // Step-4, OuterParams, Reduction dim: bdimx (already done)
     iop.warps_per_sm = ceilDiv(iop.bdimx * iop.bdimy, dev_prop->warpSize) *
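
For context on the race this patch works around, here is a minimal standalone CUDA sketch. It is not nvFuser's generated code: the kernel, the `cp_async_4`/`cp_async_wait_all` helpers, and the launch shape are all illustrative assumptions. It shows how a `cp.async` transfer lands in shared memory and how `cp.async.wait_all` forces it to complete before the buffer is touched again. With bdimy > 1, several reduction rows share one block's shared memory, so an in-flight copy that is never awaited can overlap a later access to the same slot, which is the kind of hazard racecheck reports in the command above.

```
// Illustrative sketch only, not nvFuser codegen. Requires sm_80+ for cp.async.
#include <cuda_runtime.h>

// Issue a 4-byte async copy from global to shared memory.
__device__ inline void cp_async_4(void* smem_dst, const void* gmem_src) {
  unsigned dst = static_cast<unsigned>(__cvta_generic_to_shared(smem_dst));
  asm volatile("cp.async.ca.shared.global [%0], [%1], 4;" ::"r"(dst),
               "l"(gmem_src));
}

// Wait for all cp.async operations previously issued by this thread.
__device__ inline void cp_async_wait_all() {
  asm volatile("cp.async.wait_all;" ::: "memory");
}

// Each threadIdx.y row stages one segment in shared memory, mimicking
// multiple reductions per block with bdimy > 1.
__global__ void staged_copy(const float* in, float* out, int n) {
  extern __shared__ float smem[];  // blockDim.x * blockDim.y floats
  float* row = smem + threadIdx.y * blockDim.x;
  int idx = (blockIdx.x * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
  if (idx < n) {
    cp_async_4(&row[threadIdx.x], &in[idx]);
  }
  // Without this wait (plus the block sync), any later read or write of the
  // same smem slot may overlap the still-in-flight transfer: a shared-memory
  // race of the kind racecheck flags.
  cp_async_wait_all();
  __syncthreads();
  if (idx < n) {
    out[idx] = row[threadIdx.x];
  }
}
```

Forcing bdimy = 1 sidesteps the hazard in the sketch by giving each block a single reduction row, at the cost of the larger bdimy the old heuristic preferred.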