Skip to content

Commit

Permalink
merge and update
Browse files Browse the repository at this point in the history
  • Loading branch information
cowanmeg committed Dec 12, 2024
2 parents c6b4c59 + 5716c09 commit 9852150
Show file tree
Hide file tree
Showing 387 changed files with 31,735 additions and 14,023 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ cppcoreguidelines-*,
-cppcoreguidelines-pro-type-vararg,
-cppcoreguidelines-special-member-functions,
-cppcoreguidelines-non-private-member-variables-in-classes,
-cppcoreguidelines-avoid-goto,
-facebook-hte-RelativeInclude,
hicpp-exception-baseclass,
hicpp-avoid-goto,
misc-unused-alias-decls,
misc-unused-using-decls,
modernize-*,
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nvfuser-ci-trigger.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:

# This job only runs for pull request comments
if: |
startsWith(github.event.comment.body, '!build') &&
( startsWith(github.event.comment.body, '!build') || startsWith(github.event.comment.body, '!test') ) &&
(github.actor == 'xwang233' || github.actor == 'jjsjann123' || github.actor == 'chang-l' || github.actor == 'csarofeen' || github.actor == 'drzejan2' || github.actor == 'IvanYashchuk' || github.actor == 'jacobhinkle' || github.actor == 'kevinstephano' || github.actor == 'liqiangxl' || github.actor == 'mmigdal-nv' || github.actor == 'naoyam' || github.actor == 'ptrblck' || github.actor == 'rdspring1' || github.actor == 'samnordmann' || github.actor == 'zasdfgbnm' || github.actor == 'crcrpar' || github.actor == 'nWEIdia' || github.actor == 'Priya2698' || github.actor == 'wujingyue' || github.actor == 'tfogal' || github.actor == 'protonu' || github.actor == 'cowanmeg' || github.actor == 'nsarka')
steps:
- name: Check if comment is issued by authorized person
Expand Down
25 changes: 25 additions & 0 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# A workflow to send CI-related helpful information to PRs
name: pull
on:
  pull_request:

run-name: CI status hello ${{ github.event.pull_request.number }} - ${{ github.event.pull_request.head.sha }}
jobs:
  status_hello:
    name: send CI hello status
    runs-on: ubuntu-latest
    # Minimal permission: only commit-status writes, nothing else.
    permissions:
      statuses: write
    steps:
      - name: Set CI hello status
        # Posts a "success" commit status ("CI notes" context) on the PR head
        # SHA, linking to the wiki page that documents the !build / !test
        # comment triggers for authorized users.
        run: |
          curl \
            -X POST \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \
            -d "{\"state\":\"success\",\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\",\"description\":\"Authorized users: comment !build or !test to trigger CI pipelines. See wiki.\",\"context\":\"CI notes\"}"
54 changes: 14 additions & 40 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")

option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
option(NVFUSER_EXPLICIT_ERROR_CHECK "" OFF)
if (NVFUSER_EXPLICIT_ERROR_CHECK)
add_compile_definitions(NVFUSER_EXPLICIT_ERROR_CHECK)
endif()
option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)

include(CMakeDependentOption)
Expand Down Expand Up @@ -188,15 +192,18 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/preseg_passes/insert_reshardings.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/make_resharding_contiguous.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/mark_aliases_prepare.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/move_pad.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/move_split_cat.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/pre_segmenter.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/propagate_shardings.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/remove_bcast_squeeze.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp
${NVFUSER_SRCS_DIR}/rng.cpp
${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
${NVFUSER_SRCS_DIR}/runtime/executor.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_dispatch.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_kernel_arg.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_params.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_utils.cpp
Expand All @@ -208,6 +215,8 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/scheduler/mark_aliases.cpp
${NVFUSER_SRCS_DIR}/scheduler/matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/multi_matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/ampere_multi_matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/hopper_multi_matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/matmul_heuristic_plugin.cpp
${NVFUSER_SRCS_DIR}/scheduler/matmul_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
Expand All @@ -227,6 +236,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/scheduler/tools/inlining.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/loop_domain_scheduler.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/maxinfo_propagator.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/resize_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/transpose.cpp
${NVFUSER_SRCS_DIR}/scheduler/utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/vectorize_helper.cpp
Expand Down Expand Up @@ -262,6 +272,7 @@ if(BUILD_PYTHON)
${NVFUSER_SRCS_DIR}/python_frontend/fusion_cache.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_definition.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_state.cpp
${NVFUSER_SRCS_DIR}/python_frontend/segmentation.cpp
${NVFUSER_SRCS_DIR}/python_frontend/translation.cpp
${NVFUSER_SRCS_DIR}/python_frontend/translation_utils.cpp
${NVFUSER_SRCS_DIR}/serde/fusion_record.cpp
Expand Down Expand Up @@ -541,13 +552,16 @@ list(APPEND JIT_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp
${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp
${NVFUSER_ROOT}/tests/cpp/test_indexing_advanced.cpp
${NVFUSER_ROOT}/tests/cpp/test_inlining.cpp
${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp
${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp
${NVFUSER_ROOT}/tests/cpp/test_loop_domain_scheduling.cpp
${NVFUSER_ROOT}/tests/cpp/test_loop_rotation.cpp
${NVFUSER_ROOT}/tests/cpp/test_mbarrier.cpp
${NVFUSER_ROOT}/tests/cpp/test_memory.cpp
${NVFUSER_ROOT}/tests/cpp/test_move_split_cat.cpp
${NVFUSER_ROOT}/tests/cpp/test_move_pad.cpp
${NVFUSER_ROOT}/tests/cpp/test_mutator.cpp
${NVFUSER_ROOT}/tests/cpp/test_no_op.cpp
${NVFUSER_ROOT}/tests/cpp/test_persistent_buffer.cpp
${NVFUSER_ROOT}/tests/cpp/test_pointwise.cpp
Expand Down Expand Up @@ -763,46 +777,6 @@ if(BUILD_NVFUSER_BENCHMARK)
-Werror -Wno-deprecated-copy
)
endif()

# multidevice benchmarks
if (NVFUSER_DISTRIBUTED)
set(MULTIDEVICE_BENCHMARK_SRCS)
list(APPEND MULTIDEVICE_BENCHMARK_SRCS
${NVFUSER_ROOT}/benchmarks/cpp/transformer.cpp
${NVFUSER_ROOT}/tests/cpp/multidevice_transformer.cpp
${NVFUSER_ROOT}/tests/cpp/utils.cpp
)

add_executable(nvfuser_multidevice_bench ${MULTIDEVICE_BENCHMARK_SRCS})
set_target_properties(nvfuser_multidevice_bench PROPERTIES
C_STANDARD ${NVFUSER_C_STANDARD}
CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
CXX_STANDARD ${NVFUSER_CPP_STANDARD}
CXX_STANDARD_REQUIRED ON
CXX_VISIBILITY_PRESET hidden
POSITION_INDEPENDENT_CODE Yes
VISIBILITY_INLINES_HIDDEN Yes
)

target_include_directories(nvfuser_multidevice_bench SYSTEM PRIVATE
${CMAKE_SOURCE_DIR}/third_party/benchmark/include
${CMAKE_SOURCE_DIR}/third_party/flatbuffers/include
${CMAKE_SOURCE_DIR}/third_party/googletest/googletest/include
)
target_include_directories(nvfuser_multidevice_bench PUBLIC ${NVFUSER_ROOT})
target_link_libraries(nvfuser_multidevice_bench PRIVATE
benchmark::benchmark
codegen_internal
)
add_dependencies(nvfuser_multidevice_bench flatc build_flatbuffer_config)

if(NOT MSVC)
target_compile_options(nvfuser_bench PRIVATE
-Wall -Wno-unused-function
-Werror -Wno-deprecated-copy
)
endif()
endif()
endif()

# --- generate runtime files
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")
## Installation

We publish nightly wheel packages on https://pypi.nvidia.com, while wheels built against stable torch versions are published on https://pypi.org.
**Wheels are published for Python version: _3.10_, _3.12_**.

built-env | cuda 12.1
:---: | :---:
torch 2.3 | nvfuser-cu121-torch23
torch 2.4 | nvfuser-cu121-torch24
torch nightly wheel | nvfuser-cu121
built-env | cuda 11.8 | cuda 12.1 | cuda 12.4
:---: | :---: | :---: | :---:
torch 2.5 (pypi.org)| nvfuser-cu118-torch25 | nvfuser-cu121-torch25 | nvfuser-cu124-torch25
torch nightly (pypi.nvidia.com) | nvfuser-cu118 | nvfuser-cu121 | nvfuser-cu124

Note that nvFuser built against a stable torch version isn't compatible with the nightly PyTorch wheel, so ensure you pick the version that suits your environment.

Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_first.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -102,7 +102,7 @@ static void NvFuserScheduler_BatchNorm(
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_first_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -115,7 +115,7 @@ static void NvFuserScheduler_BatchNorm_BWD(
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_last.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_nhwc(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -103,7 +103,7 @@ static void NvFuserScheduler_BatchNorm_nhwc(
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_last_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_nhwc_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -116,7 +116,7 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
24 changes: 12 additions & 12 deletions benchmarks/cpp/bert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_DivMaxSoftDropFwd(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
Expand All @@ -135,15 +135,15 @@ static void NvFuserScheduler_DivMaxSoftDropFwd(
std::vector<c10::IValue> at_inputs = {t0, t1};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}

static void NvFuserScheduler_DivMaxSoftDropBwd(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
Expand All @@ -162,7 +162,7 @@ static void NvFuserScheduler_DivMaxSoftDropBwd(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

// Some reason t1 isn't used, ignore it.
bytes -=
Expand Down Expand Up @@ -228,7 +228,7 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -247,7 +247,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3, t4};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -304,7 +304,7 @@ static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -322,7 +322,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -380,7 +380,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -398,7 +398,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
std::vector<c10::IValue> at_inputs = {t4, t5, t1, t8};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -438,7 +438,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -454,7 +454,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
std::vector<c10::IValue> at_inputs = {t0, t21};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/broadcast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ static void setupBroadcast(Fusion* fusion, DataType dtype, int bcast_axis) {

static void NvFuserScheduler_Broadcast(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype,
int bcast_dim) {
auto bcast_size = benchmark_state.range(0);
Expand All @@ -74,7 +74,7 @@ static void NvFuserScheduler_Broadcast(

std::vector<c10::IValue> aten_inputs({t0, t1});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
Loading

0 comments on commit 9852150

Please sign in to comment.