Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create dispatch system for executors #3263

Merged
merged 42 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
6aa977f
Broke apart executors, tests will fail, need dispatch.
csarofeen Oct 20, 2024
c23c38d
Change name: FusionExecutor -> KernelExecutor
csarofeen Oct 20, 2024
bf835ec
Draft executor dispatch.
csarofeen Oct 20, 2024
3d7f017
Rename fe->ke for benchmarks.
csarofeen Oct 20, 2024
61f7cb5
Fix instrumentation in executors.
csarofeen Oct 20, 2024
48fdaef
Fix build with executor dispatch (still test failures). Make sure Fus…
csarofeen Oct 20, 2024
a44018e
Finish rename FusionExecutor fe -> KernelExecutor ke
csarofeen Oct 21, 2024
0e10199
All but one nvfuser_tests pass.
csarofeen Oct 23, 2024
8ed49a1
No op scheduler generates empty kernels.
csarofeen Oct 24, 2024
b8c3118
Rename HostIrExecutor to HostIrEvaluator, move HostIRExecutor to mult…
csarofeen Oct 25, 2024
21340b4
Rename inputes/outputBytesAccessed to computeBytes
csarofeen Oct 25, 2024
84c2332
In KernelExecutor rename compileFusion and runFusion to comile and run
csarofeen Oct 25, 2024
075c293
Remove executor_abstract.cpp as it wasn't used
csarofeen Oct 25, 2024
9229abd
Revert whitespace changes.
csarofeen Oct 25, 2024
fc5d788
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Oct 25, 2024
fe05a3a
Fix merge conflicts.
csarofeen Oct 25, 2024
76a100d
Fix matmul tests.
csarofeen Oct 26, 2024
55b24f6
Fix executor in host ir evaluator for host unit, send it to KernelExe…
csarofeen Nov 4, 2024
d372ae2
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Nov 4, 2024
097674d
Merge fixes.
csarofeen Nov 4, 2024
c917725
clang tidy
csarofeen Nov 4, 2024
cb14bb9
profiler fix
csarofeen Nov 4, 2024
88a58cd
Allow serializing uncompiled KernelExecutor.
csarofeen Nov 4, 2024
d829e41
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Nov 5, 2024
38cce1e
Merge fixes.
csarofeen Nov 5, 2024
68e2692
Fix serialization errors in executor dispatch (#3353)
rdspring1 Nov 6, 2024
2749521
PR Comments part 1.
csarofeen Nov 6, 2024
198e6d4
PR Comments part 2.
csarofeen Nov 6, 2024
f4acdb7
Merge branch 'main' into executor_dispatch
naoyam Nov 6, 2024
6312789
Merge branch 'main' into executor_dispatch
naoyam Nov 6, 2024
3f2503f
Last PR Comment cleanup.
csarofeen Nov 6, 2024
8290e5c
Merge branch 'main' into executor_dispatch
naoyam Nov 7, 2024
dccaef0
Avoid clang-tidy warning
naoyam Nov 7, 2024
9b1a4e4
Merge branch 'main' into executor_dispatch
naoyam Nov 7, 2024
b33c8e1
cleanup
naoyam Nov 8, 2024
0c8043f
cleanup
naoyam Nov 8, 2024
b1634f3
cleanup
naoyam Nov 8, 2024
6df7ae1
Merge branch 'main' into executor_dispatch
naoyam Nov 8, 2024
6ebc56f
cleanup
naoyam Nov 8, 2024
f10c3af
typo
naoyam Nov 8, 2024
5e24332
Set `fusion_id` and `device_id` of `KernelExecutor` in constructor fo…
rdspring1 Nov 8, 2024
fa68935
Merge branch 'main' into executor_dispatch
naoyam Nov 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/rng.cpp
${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
${NVFUSER_SRCS_DIR}/runtime/executor.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_abstract.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_dispatch.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_kernel_arg.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_params.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_utils.cpp
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_first.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -102,7 +102,7 @@ static void NvFuserScheduler_BatchNorm(
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_first_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -115,7 +115,7 @@ static void NvFuserScheduler_BatchNorm_BWD(
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_last.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_nhwc(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -103,7 +103,7 @@ static void NvFuserScheduler_BatchNorm_nhwc(
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_last_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_nhwc_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

Expand All @@ -116,7 +116,7 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
24 changes: 12 additions & 12 deletions benchmarks/cpp/bert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_DivMaxSoftDropFwd(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
Expand All @@ -135,15 +135,15 @@ static void NvFuserScheduler_DivMaxSoftDropFwd(
std::vector<c10::IValue> at_inputs = {t0, t1};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}

static void NvFuserScheduler_DivMaxSoftDropBwd(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
Expand All @@ -162,7 +162,7 @@ static void NvFuserScheduler_DivMaxSoftDropBwd(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

// Some reason t1 isn't used, ignore it.
bytes -=
Expand Down Expand Up @@ -228,7 +228,7 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -247,7 +247,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3, t4};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -304,7 +304,7 @@ static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -322,7 +322,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -380,7 +380,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -398,7 +398,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
std::vector<c10::IValue> at_inputs = {t4, t5, t1, t8};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -438,7 +438,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
Expand All @@ -454,7 +454,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
std::vector<c10::IValue> at_inputs = {t0, t21};

auto bytes =
runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/broadcast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ static void setupBroadcast(Fusion* fusion, DataType dtype, int bcast_axis) {

static void NvFuserScheduler_Broadcast(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype,
int bcast_dim) {
auto bcast_size = benchmark_state.range(0);
Expand All @@ -74,7 +74,7 @@ static void NvFuserScheduler_Broadcast(

std::vector<c10::IValue> aten_inputs({t0, t1});

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
24 changes: 12 additions & 12 deletions benchmarks/cpp/gelu_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,8 @@ static void NvFuserScheduler_GeluBackward_Compile(
&fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));

for (auto _ : benchmark_state) {
FusionExecutor executor;
executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
KernelExecutor ke;
ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
}
}

Expand All @@ -187,13 +187,13 @@ static void NvFuserScheduler_GeluBackward_RunFusion(
auto heuristic_params = SchedulerEntry::scheduleWith(
&fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));

FusionExecutor executor;
executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
KernelExecutor ke;
ke.compileFusion(&fusion, inputs, heuristic_params->lparams);

C10_CUDA_CHECK(cudaDeviceSynchronize());

for (auto _ : benchmark_state) {
outputs = executor.runFusion(
outputs = ke.runFusion(
c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
Expand All @@ -218,11 +218,11 @@ static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly(
auto heuristic_params = SchedulerEntry::scheduleWith(
&fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));

FusionExecutor executor;
executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
KernelExecutor ke;
ke.compileFusion(&fusion, inputs, heuristic_params->lparams);

runBenchmarkIterations(
benchmark_state, &executor, inputs, heuristic_params->lparams);
benchmark_state, &ke, inputs, heuristic_params->lparams);
}

BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_GpuOnly)
Expand All @@ -247,12 +247,12 @@ static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly(
auto heuristic_params = SchedulerEntry::scheduleWith(
&fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));

FusionExecutor executor;
executor.setExecuteKernelFlag(false);
executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
KernelExecutor ke;
ke.setExecuteKernelFlag(false);
ke.compileFusion(&fusion, inputs, heuristic_params->lparams);

for (auto _ : benchmark_state) {
outputs = executor.runFusion(
outputs = ke.runFusion(
c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
}
}
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/gelu_backward_reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ static void setupGeluBackwardReduction(

static void NvFuserScheduler_GeluBackwardReduction(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
FusionExecutorCache* executor_cache,
DataType dtype,
int reduction_dim) {
auto reduction_size = benchmark_state.range(0);
Expand All @@ -112,7 +112,7 @@ static void NvFuserScheduler_GeluBackwardReduction(

std::vector<c10::IValue> aten_inputs = {aten_input_grad, aten_input_x};

runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

// inputs: gradient tensor + input tensor
// outputs: output, output_of_reduction
Expand Down
24 changes: 12 additions & 12 deletions benchmarks/cpp/heuristic_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ using namespace nvfuser;

static auto getLayerBackwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::unique_ptr<FusionExecutorCache>& executor_cache,
std::vector<c10::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Expand Down Expand Up @@ -84,12 +84,12 @@ static auto getLayerBackwardNormRuntime(
auto aten_mean = std::get<1>(aten_results);
auto aten_rstd = std::get<2>(aten_results);

fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {
aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);

return fec->getMostRecentKernelRuntime();
return executor_cache->getMostRecentKernelRuntime();
}

static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
Expand All @@ -98,14 +98,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
FusionGuard fg(fusion_ptr.get());

// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::unique_ptr<FusionExecutorCache> executor_cache;
std::vector<c10::IValue> aten_inputs;

std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};

auto runtime = getLayerBackwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape);

KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
Expand All @@ -120,7 +120,7 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache(

static auto getLayerForwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::unique_ptr<FusionExecutorCache>& executor_cache,
std::vector<c10::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Expand All @@ -141,11 +141,11 @@ static auto getLayerForwardNormRuntime(
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(shape, options);

fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {aten_input};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);

return fec->getMostRecentKernelRuntime();
return executor_cache->getMostRecentKernelRuntime();
}

static void NvFuserScheduler_LayerNormForward_HeuristicCache(
Expand All @@ -154,14 +154,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicCache(
FusionGuard fg(fusion_ptr.get());

// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::unique_ptr<FusionExecutorCache> executor_cache;
std::vector<c10::IValue> aten_inputs;

std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};

auto runtime = getLayerForwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape);

KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
Expand Down
Loading