From e96a63a3818ea9abba4e4b1f7d61697f25467eb6 Mon Sep 17 00:00:00 2001
From: Liqiang Lu <116412316+liqiangxl@users.noreply.github.com>
Date: Wed, 20 Nov 2024 11:21:38 -0500
Subject: [PATCH] use bdimy = 1 to WAR smem race (#3423)

When total_reduction_numel <= 1024, the scheduler may use multiple
reductions per block with bdimy > 1; this leads to a race condition in
shared memory when using async copy. Adding `cp.async.wait_all` after
the 1st async copy can avoid the race, but we need to figure out the
root cause before we can safely use it. So, here we set bdimy = 1 as a
WAR. Should be reverted after
https://github.com/NVIDIA/Fuser/pull/3438 is merged.

Race detected with:
```
NVFUSER_DUMP=scheduler_params,cuda_to_file NVFUSER_ENABLE=kernel_debug PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer --tool racecheck --racecheck-detect-level info ./nvfuser_tests --gtest_filter='CombinedSchedulerTest.LayerNormBackward/dtype_double_batch_216_hidden_96'
```
---
 csrc/scheduler/normalization_inner_outer.cpp | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/csrc/scheduler/normalization_inner_outer.cpp b/csrc/scheduler/normalization_inner_outer.cpp
index 8f02d0bdf1d..b32f8873ed1 100644
--- a/csrc/scheduler/normalization_inner_outer.cpp
+++ b/csrc/scheduler/normalization_inner_outer.cpp
@@ -657,24 +657,18 @@ std::unique_ptr<ReductionParams> innerOuterPersistentHeuristic(
   if (inner_dim_numel <= 1024) {
     rparams->multiple_reds_per_blk = true;
     rparams->tidx_for_outer_reduction = true;
-    constexpr int64_t threads_per_block_mrpb = 512;
     // Step-1, InnerParams, Reduction dim: inner_vect(reuse),
     // inner_batch(reuse), bdimx
     iop.bdimx = ceilDiv(inner_dim_numel, iop.inner_vect * iop.inner_batch);
     // Step-2, InnerParams, Iteration dim: gdimy, bdimy (in next step)
-    iop.gdimy =
-        getGdimy(iop.inner_vect, threads_per_block_mrpb, iop.inner_batch);
+    iop.gdimy = getGdimy(iop.inner_vect, iop.bdimx, iop.inner_batch);
     // Step-3, OuterParams, Iteration dim: vectorization_factor_outer(reuse),
-    // bdimy, gdimy (in previous step). We prefer bdimy to be larger enough to
-    // cover what is left in both the outer_dim and inner_dim. However, it
-    // should not exceed the limitation set by threads_per_block_mrpb.
-    int64_t bdimy_tmp = std::max(
-        ceilDiv(outer_dim_numel, iop.gdimy),
-        ceilDiv(inner_dim_numel, iop.vectorization_factor_outer * iop.gdimy));
-    iop.bdimy = std::min(threads_per_block_mrpb / iop.bdimx, bdimy_tmp);
+    // bdimy, gdimy (in previous step).
+    // WAR for https://github.com/NVIDIA/Fuser/issues/3428
+    iop.bdimy = 1;
     // Step-4, OuterParams, Reduction dim: bdimx (already done)
     iop.warps_per_sm = ceilDiv(iop.bdimx * iop.bdimy, dev_prop->warpSize) *
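
For context on the race this patch works around, here is a minimal standalone CUDA sketch. It is not nvFuser's generated code: the kernel, the `cp_async_4`/`cp_async_wait_all` helpers, and the launch shape are all illustrative assumptions. It shows how a `cp.async` transfer lands in shared memory and how `cp.async.wait_all` forces it to complete before the buffer is touched again. With bdimy > 1, several reduction rows share one block's shared memory, so an in-flight copy that is never awaited can overlap a later access to the same slot, which is the kind of hazard racecheck reports in the command above.

```
// Illustrative sketch only, not nvFuser codegen. Requires sm_80+ for cp.async.
#include <cuda_runtime.h>

// Issue a 4-byte async copy from global to shared memory.
__device__ inline void cp_async_4(void* smem_dst, const void* gmem_src) {
  unsigned dst = static_cast<unsigned>(__cvta_generic_to_shared(smem_dst));
  asm volatile("cp.async.ca.shared.global [%0], [%1], 4;" ::"r"(dst),
               "l"(gmem_src));
}

// Wait for all cp.async operations previously issued by this thread.
__device__ inline void cp_async_wait_all() {
  asm volatile("cp.async.wait_all;" ::: "memory");
}

// Each threadIdx.y row stages one segment in shared memory, mimicking
// multiple reductions per block with bdimy > 1.
__global__ void staged_copy(const float* in, float* out, int n) {
  extern __shared__ float smem[];  // blockDim.x * blockDim.y floats
  float* row = smem + threadIdx.y * blockDim.x;
  int idx = (blockIdx.x * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
  if (idx < n) {
    cp_async_4(&row[threadIdx.x], &in[idx]);
  }
  // Without this wait (plus the block sync), any later read or write of the
  // same smem slot may overlap the still-in-flight transfer: a shared-memory
  // race of the kind racecheck flags.
  cp_async_wait_all();
  __syncthreads();
  if (idx < n) {
    out[idx] = row[threadIdx.x];
  }
}
```

Forcing bdimy = 1 sidesteps the hazard in the sketch by giving each block a single reduction row, at the cost of the larger bdimy the old heuristic preferred.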