Skip to content

Commit

Permalink
merge and update
Browse files Browse the repository at this point in the history
  • Loading branch information
cowanmeg committed Dec 12, 2024
2 parents c6b4c59 + 5716c09 commit 9852150
Show file tree
Hide file tree
Showing 387 changed files with 31,735 additions and 14,023 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ cppcoreguidelines-*,
-cppcoreguidelines-pro-type-vararg,
-cppcoreguidelines-special-member-functions,
-cppcoreguidelines-non-private-member-variables-in-classes,
-cppcoreguidelines-avoid-goto,
-facebook-hte-RelativeInclude,
hicpp-exception-baseclass,
hicpp-avoid-goto,
misc-unused-alias-decls,
misc-unused-using-decls,
modernize-*,
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nvfuser-ci-trigger.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:

# This job only runs for pull request comments
if: |
startsWith(github.event.comment.body, '!build') &&
( startsWith(github.event.comment.body, '!build') || startsWith(github.event.comment.body, '!test') ) &&
(github.actor == 'xwang233' || github.actor == 'jjsjann123' || github.actor == 'chang-l' || github.actor == 'csarofeen' || github.actor == 'drzejan2' || github.actor == 'IvanYashchuk' || github.actor == 'jacobhinkle' || github.actor == 'kevinstephano' || github.actor == 'liqiangxl' || github.actor == 'mmigdal-nv' || github.actor == 'naoyam' || github.actor == 'ptrblck' || github.actor == 'rdspring1' || github.actor == 'samnordmann' || github.actor == 'zasdfgbnm' || github.actor == 'crcrpar' || github.actor == 'nWEIdia' || github.actor == 'Priya2698' || github.actor == 'wujingyue' || github.actor == 'tfogal' || github.actor == 'protonu' || github.actor == 'cowanmeg' || github.actor == 'nsarka')
steps:
- name: Check if comment is issued by authorized person
Expand Down
25 changes: 25 additions & 0 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# A workflow to send CI-related helpful information to PRs
name: pull
on:
  pull_request:

run-name: CI status hello ${{ github.event.pull_request.number }} - ${{ github.event.pull_request.head.sha }}
jobs:
  status_hello:
    name: send CI hello status
    runs-on: ubuntu-latest
    # Minimal permission: only commit-status writes, nothing else.
    permissions:
      statuses: write
    steps:
      - name: Set CI hello status
        # Posts a "success" commit status ("CI notes" context) on the PR head
        # SHA, linking to the wiki page that documents the !build / !test
        # comment triggers for authorized users.
        run: |
          curl \
            -X POST \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \
            -d "{\"state\":\"success\",\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\",\"description\":\"Authorized users: comment !build or !test to trigger CI pipelines. See wiki.\",\"context\":\"CI notes\"}"
54 changes: 14 additions & 40 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")

option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
option(NVFUSER_EXPLICIT_ERROR_CHECK "" OFF)
if (NVFUSER_EXPLICIT_ERROR_CHECK)
add_compile_definitions(NVFUSER_EXPLICIT_ERROR_CHECK)
endif()
option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)

include(CMakeDependentOption)
Expand Down Expand Up @@ -188,15 +192,18 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/preseg_passes/insert_reshardings.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/make_resharding_contiguous.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/mark_aliases_prepare.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/move_pad.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/move_split_cat.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/pre_segmenter.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/propagate_shardings.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/remove_bcast_squeeze.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp
${NVFUSER_SRCS_DIR}/rng.cpp
${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
${NVFUSER_SRCS_DIR}/runtime/executor.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_dispatch.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_kernel_arg.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_params.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_utils.cpp
Expand All @@ -208,6 +215,8 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/scheduler/mark_aliases.cpp
${NVFUSER_SRCS_DIR}/scheduler/matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/multi_matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/ampere_multi_matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/hopper_multi_matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/matmul_heuristic_plugin.cpp
${NVFUSER_SRCS_DIR}/scheduler/matmul_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
Expand All @@ -227,6 +236,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/scheduler/tools/inlining.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/loop_domain_scheduler.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/maxinfo_propagator.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/resize_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/transpose.cpp
${NVFUSER_SRCS_DIR}/scheduler/utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/vectorize_helper.cpp
Expand Down Expand Up @@ -262,6 +272,7 @@ if(BUILD_PYTHON)
${NVFUSER_SRCS_DIR}/python_frontend/fusion_cache.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_definition.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_state.cpp
${NVFUSER_SRCS_DIR}/python_frontend/segmentation.cpp
${NVFUSER_SRCS_DIR}/python_frontend/translation.cpp
${NVFUSER_SRCS_DIR}/python_frontend/translation_utils.cpp
${NVFUSER_SRCS_DIR}/serde/fusion_record.cpp
Expand Down Expand Up @@ -541,13 +552,16 @@ list(APPEND JIT_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp
${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp
${NVFUSER_ROOT}/tests/cpp/test_indexing_advanced.cpp
${NVFUSER_ROOT}/tests/cpp/test_inlining.cpp
${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp
${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp
${NVFUSER_ROOT}/tests/cpp/test_loop_domain_scheduling.cpp
${NVFUSER_ROOT}/tests/cpp/test_loop_rotation.cpp
${NVFUSER_ROOT}/tests/cpp/test_mbarrier.cpp
${NVFUSER_ROOT}/tests/cpp/test_memory.cpp
${NVFUSER_ROOT}/tests/cpp/test_move_split_cat.cpp
${NVFUSER_ROOT}/tests/cpp/test_move_pad.cpp
${NVFUSER_ROOT}/tests/cpp/test_mutator.cpp
${NVFUSER_ROOT}/tests/cpp/test_no_op.cpp
${NVFUSER_ROOT}/tests/cpp/test_persistent_buffer.cpp
${NVFUSER_ROOT}/tests/cpp/test_pointwise.cpp
Expand Down Expand Up @@ -763,46 +777,6 @@ if(BUILD_NVFUSER_BENCHMARK)
-Werror -Wno-deprecated-copy
)
endif()

# multidevice benchmarks
if (NVFUSER_DISTRIBUTED)
set(MULTIDEVICE_BENCHMARK_SRCS)
list(APPEND MULTIDEVICE_BENCHMARK_SRCS
${NVFUSER_ROOT}/benchmarks/cpp/transformer.cpp
${NVFUSER_ROOT}/tests/cpp/multidevice_transformer.cpp
${NVFUSER_ROOT}/tests/cpp/utils.cpp
)

add_executable(nvfuser_multidevice_bench ${MULTIDEVICE_BENCHMARK_SRCS})
set_target_properties(nvfuser_multidevice_bench PROPERTIES
C_STANDARD ${NVFUSER_C_STANDARD}
CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
CXX_STANDARD ${NVFUSER_CPP_STANDARD}
CXX_STANDARD_REQUIRED ON
CXX_VISIBILITY_PRESET hidden
POSITION_INDEPENDENT_CODE Yes
VISIBILITY_INLINES_HIDDEN Yes
)

target_include_directories(nvfuser_multidevice_bench SYSTEM PRIVATE
${CMAKE_SOURCE_DIR}/third_party/benchmark/include
${CMAKE_SOURCE_DIR}/third_party/flatbuffers/include
${CMAKE_SOURCE_DIR}/third_party/googletest/googletest/include
)
target_include_directories(nvfuser_multidevice_bench PUBLIC ${NVFUSER_ROOT})
target_link_libraries(nvfuser_multidevice_bench PRIVATE
benchmark::benchmark
codegen_internal
)
add_dependencies(nvfuser_multidevice_bench flatc build_flatbuffer_config)

if(NOT MSVC)
target_compile_options(nvfuser_bench PRIVATE
-Wall -Wno-unused-function
-Werror -Wno-deprecated-copy
)
endif()
endif()
endif()

# --- generate runtime files
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")
## Installation

We publish nightly wheel packages on https://pypi.nvidia.com, while wheels built against stable torch versions are published on https://pypi.org.
**Wheels are published for Python version: _3.10_, _3.12_**.

built-env | cuda 12.1
:---: | :---:
torch 2.3 | nvfuser-cu121-torch23
torch 2.4 | nvfuser-cu121-torch24
torch nightly wheel | nvfuser-cu121
built-env | cuda 11.8 | cuda 12.1 | cuda 12.4
:---: | :---: | :---: | :---:
torch 2.5 (pypi.org)| nvfuser-cu118-torch25 | nvfuser-cu121-torch25 | nvfuser-cu124-torch25
torch nightly (pypi.nvidia.com) | nvfuser-cu118 | nvfuser-cu121 | nvfuser-cu124

Note that nvFuser built against a stable torch version isn't compatible with the nightly PyTorch wheel, so ensure you pick the version that suits your environment.

Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_first.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -102,7 +102,7 @@ static void NvFuserScheduler_BatchNorm(
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_first_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -115,7 +115,7 @@ static void NvFuserScheduler_BatchNorm_BWD(
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_last.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_nhwc(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -103,7 +103,7 @@ static void NvFuserScheduler_BatchNorm_nhwc(
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_last_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_nhwc_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -116,7 +116,7 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
24 changes: 12 additions & 12 deletions benchmarks/cpp/bert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_DivMaxSoftDropFwd(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
Expand All @@ -135,15 +135,15 @@ static void NvFuserScheduler_DivMaxSoftDropFwd(
std::vector<c10::IValue> at_inputs = {t0, t1};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}

static void NvFuserScheduler_DivMaxSoftDropBwd(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
Expand All @@ -162,7 +162,7 @@ static void NvFuserScheduler_DivMaxSoftDropBwd(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

// Some reason t1 isn't used, ignore it.
bytes -=
Expand Down Expand Up @@ -228,7 +228,7 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -247,7 +247,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3, t4};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -304,7 +304,7 @@ static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -322,7 +322,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -380,7 +380,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -398,7 +398,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
std::vector<c10::IValue> at_inputs = {t4, t5, t1, t8};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -438,7 +438,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -454,7 +454,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
std::vector<c10::IValue> at_inputs = {t0, t21};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/broadcast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ static void setupBroadcast(Fusion* fusion, DataType dtype, int bcast_axis) {

static void NvFuserScheduler_Broadcast(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype,
int bcast_dim) {
auto bcast_size = benchmark_state.range(0);
Expand All @@ -74,7 +74,7 @@ static void NvFuserScheduler_Broadcast(

std::vector<c10::IValue> aten_inputs({t0, t1});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
Loading

0 comments on commit 9852150

Please sign in to comment.