Merge remote-tracking branch 'origin/main' into resize_scheduler
naoyam committed Dec 13, 2024
2 parents 8c4b919 + 7d1b64b commit 498b5f9
Showing 117 changed files with 4,934 additions and 2,112 deletions.
13 changes: 13 additions & 0 deletions benchmarks/python/core.py
@@ -10,6 +10,9 @@
from nvfuser import FusionDefinition, FusionCache
from nvfuser.pytorch_utils import DEVICE_PROPERTIES
import warnings
import thunder
from thunder.executors.nvfuserex import nvfuserex


# These variables can be overwritten through CLI commands
# --benchmark-rounds=rounds --benchmark-warmup-rounds=warmup_rounds
@@ -44,6 +47,16 @@ def unary_bwd_torch(inputs: List): # [output, grad_out]
inputs[0].backward(inputs[1], retain_graph=True)


def with_executor(executor: str, fwd_fn: Callable) -> Callable:
    assert executor in ["eager", "torchcompile", "thunder"]
    if executor == "eager":
        return fwd_fn
    if executor == "torchcompile":
        return torch.compile(fwd_fn)
    if executor == "thunder":
        return thunder.jit(fwd_fn, nv_enable_bookend=False, executors=[nvfuserex])


def compute_total_iobytes(
    tensor_props: dict[str, tuple[int | tuple[int, ...], torch.dtype]]
):
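
Note: with_executor (added above) replaces the per-file {"eager": ..., "torchcompile": ...} dictionaries in the baseline benchmarks below. A minimal usage sketch follows, with a hypothetical add_bias function and shapes; it assumes with_executor is imported from benchmarks/python/core.py and that thunder is installed for the "thunder" path.

```python
import torch

# Hypothetical toy forward function; the real benchmarks pass their own fwd fns.
def add_bias(inputs: list):  # [x, bias]
    return inputs[0] + inputs[1]

x = torch.randn(1024, 1024, device="cuda")
bias = torch.randn(1024, device="cuda")

for executor in ["eager", "torchcompile", "thunder"]:
    # "eager" returns the function unchanged, "torchcompile" wraps it in
    # torch.compile, and "thunder" JIT-compiles it with the nvFuser executor.
    fn = with_executor(executor, add_bias)
    out = fn([x, bias])
```
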
11 changes: 6 additions & 5 deletions benchmarks/python/normalization.py
@@ -5,7 +5,7 @@
from .global_params import PROMOTE_DTYPES
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
import torch
from .core import run_benchmark, unary_bwd_torch, clear_dynamo_cache
from .core import run_benchmark, unary_bwd_torch, clear_dynamo_cache, with_executor
import numpy as np


@@ -453,12 +453,12 @@ def norm_fwd_baseline_benchmark(

norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn

benchmark_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
benchmark_fn = with_executor(executor, norm_fwd_fn)

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
[inputs, weight, bias, running_mean, running_var],
iobytes=norm_fwd_iobytes(size, dtype, norm),
)
@@ -493,8 +493,9 @@ def norm_bwd_baseline_benchmark(
norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn

# Compile the fwd fn for torchcompile
fwd_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
outputs = fwd_fn[executor]([inputs, weight, bias, running_mean, running_var])
fwd_fn = with_executor(executor, norm_fwd_fn)
fwd_inputs = [inputs, weight, bias, running_mean, running_var]
outputs = fwd_fn(fwd_inputs)

# Manually compute IOBytes: See PR #1725
run_benchmark(
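
In the backward baselines (norm_bwd_baseline_benchmark above and the test_*_bwd.py files below), the executor-wrapped forward runs once to produce outputs, and the function actually benchmarked is unary_bwd_torch. The run_benchmark calls are truncated in this view; the sketch below shows the general flow, with the argument list assumed for illustration rather than taken from this diff.

```python
# Sketch only: the exact run_benchmark arguments are truncated above and may differ.
fwd_fn = with_executor(executor, norm_fwd_fn)
outputs = fwd_fn([inputs, weight, bias, running_mean, running_var])

run_benchmark(
    benchmark,
    unary_bwd_torch,   # [output, grad_out] -> output.backward(grad_out, retain_graph=True)
    [outputs, grads],  # assumed backward inputs
)
```
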
9 changes: 3 additions & 6 deletions benchmarks/python/test_broadcast_add_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
from .core import run_benchmark, clear_dynamo_cache, with_executor
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -112,14 +112,11 @@ def test_bcast_add_baseline_benchmark(
x = x.t()
assert x.is_contiguous() == contiguous

benchmark_fn = {
"eager": bcast_add_fwd_fn,
"torchcompile": torch.compile(bcast_add_fwd_fn),
}
benchmark_fn = with_executor(executor, bcast_add_fwd_fn)

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
[bias, x, bcast_axis],
)
18 changes: 5 additions & 13 deletions benchmarks/python/test_dropout_layernorm_bwd.py
@@ -9,9 +9,11 @@
clear_dynamo_cache,
unary_bwd_torch,
compute_total_iobytes,
with_executor,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import dropout_layernorm


def dropout_layernorm_bwd_fusion(
@@ -208,20 +210,10 @@ def test_dropout_layernorm_bwd_baseline_benchmark(
weights = torch.randn(size[1], device="cuda", dtype=dtype, requires_grad=True)
bias = torch.randn(size[1], device="cuda", dtype=dtype, requires_grad=True)

def dropout_layernorm_fwd():
return torch.nn.functional.layer_norm(
input2 + torch.nn.functional.dropout(input1, p=dropout_p),
normalized_shape=input1.shape[1:],
weight=weights,
bias=bias,
)

# Compile the fwd fn for torchcompile
fwd_fn = {
"eager": dropout_layernorm_fwd,
"torchcompile": torch.compile(dropout_layernorm_fwd),
}
outputs = fwd_fn[executor]()
fwd_fn = with_executor(executor, dropout_layernorm)
fwd_inputs = [input1, input2, weights, bias, dropout_p]
outputs = fwd_fn(fwd_inputs)

# Manually compute IOBytes: See PR #1725
run_benchmark(
20 changes: 4 additions & 16 deletions benchmarks/python/test_dropout_layernorm_fwd.py
@@ -8,9 +8,11 @@
run_benchmark,
clear_dynamo_cache,
compute_total_iobytes,
with_executor,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import dropout_layernorm


def dropout_layernorm_fwd_fusion(
@@ -73,17 +75,6 @@ def dropout_layernorm_fwd_fusion(
fd.add_output(T10)


def dropout_layernorm_fwd(
    inputs: list,
):  # [in_tensor1, in_tensor2, weights, bias, dropout_p]
    return torch.nn.functional.layer_norm(
        inputs[1] + torch.nn.functional.dropout(inputs[0], p=inputs[-1]),
        normalized_shape=inputs[0].shape[1:],
        weight=inputs[2],
        bias=inputs[3],
    )


def dropout_layernorm_fwd_iobytes(size: tuple, dtype: torch.dtype):
# Manual IOByte computation is required since nvFuser outputs differ from baseline outputs (output).
nvf_inp_out = {
@@ -181,15 +172,12 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
dropout_p,
]

benchmark_fn = {
"eager": dropout_layernorm_fwd,
"torchcompile": torch.compile(dropout_layernorm_fwd),
}
benchmark_fn = with_executor(executor, dropout_layernorm)

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
inputs,
iobytes=dropout_layernorm_fwd_iobytes(size, dtype),
)
15 changes: 5 additions & 10 deletions benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -9,9 +9,11 @@
clear_dynamo_cache,
unary_bwd_torch,
compute_total_iobytes,
with_executor,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import dropout_rmsnorm


def dropout_rmsnorm_bwd_fusion(
@@ -186,16 +188,9 @@ def test_dropout_rmsnorm_bwd_baseline_benchmark(
grads = torch.randn(size, device="cuda", dtype=dtype)
weights = torch.randn(size[1], device="cuda", dtype=dtype)

def dropout_rmsnorm_fwd():
x = input2 + torch.nn.functional.dropout(input1, p=dropout_p)
output = weights * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)
return output

fwd_fn = {
"eager": dropout_rmsnorm_fwd,
"torchcompile": torch.compile(dropout_rmsnorm_fwd),
}
outputs = fwd_fn[executor]()
fwd_fn = with_executor(executor, dropout_rmsnorm)
fwd_inputs = [input1, input2, weights, dropout_p]
outputs = fwd_fn(fwd_inputs)

run_benchmark(
benchmark,
15 changes: 4 additions & 11 deletions benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -8,9 +8,11 @@
run_benchmark,
clear_dynamo_cache,
compute_total_iobytes,
with_executor,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import dropout_rmsnorm


def dropout_rmsnorm_fwd_fusion(
@@ -80,12 +82,6 @@ def dropout_rmsnorm_fwd_fusion(
fd.add_output(T28)


def dropout_rmsnorm_fwd(inputs: list):
    input1, input2, weights, dropout_p = inputs
    x = input2 + torch.nn.functional.dropout(input1, p=dropout_p)
    return weights * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)


def dropout_rmsnorm_fwd_iobytes(size: tuple, dtype: torch.dtype):
# Manual IOByte computation is required since nvFuser input/outputs differ from baseline outputs (output).
nvf_inp_out = {
@@ -165,15 +161,12 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
dropout_p,
]

benchmark_fn = {
"eager": dropout_rmsnorm_fwd,
"torchcompile": torch.compile(dropout_rmsnorm_fwd),
}
benchmark_fn = with_executor(executor, dropout_rmsnorm)

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
inputs,
iobytes=dropout_rmsnorm_fwd_iobytes(size, dtype),
)
14 changes: 5 additions & 9 deletions benchmarks/python/test_gelu_bwd.py
@@ -4,10 +4,11 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch
from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
from .torch_ops import gelu


def gelu_bwd_fusion(
@@ -103,14 +104,9 @@ def test_gelu_bwd_baseline_benchmark(
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
grads = torch.randn(size, device="cuda", dtype=dtype)

def gelu_fwd():
return torch.nn.functional.gelu(inputs + bias, approximate="tanh")

fwd_fn = {
"eager": gelu_fwd,
"torchcompile": torch.compile(gelu_fwd),
}
outputs = fwd_fn[executor]()
fwd_fn = with_executor(executor, gelu)
fwd_inputs = [inputs, bias]
outputs = fwd_fn(fwd_inputs)

run_benchmark(
benchmark,
9 changes: 3 additions & 6 deletions benchmarks/python/test_gelu_bwd_reduction.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
from .core import run_benchmark, clear_dynamo_cache, with_executor
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -121,14 +121,11 @@ def test_gelu_bwd_reduction_baseline_benchmark(
grads = torch.randn(size, device="cuda", dtype=dtype)
eager_output = torch.nn.functional.gelu(inputs + bias, approximate="tanh")

benchmark_fn = {
"eager": gelu_bwd_reduction_torch,
"torchcompile": torch.compile(gelu_bwd_reduction_torch),
}
benchmark_fn = with_executor(executor, gelu_bwd_reduction_torch)

run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
[eager_output, grads, inputs, reduction_axis],
iobytes=gelu_bwd_reduction_iobytes(size, dtype, reduction_axis),
)
16 changes: 5 additions & 11 deletions benchmarks/python/test_gelu_fwd.py
@@ -4,9 +4,10 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
from .core import run_benchmark, clear_dynamo_cache, with_executor
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import gelu


def gelu_fwd_fusion(
@@ -41,10 +42,6 @@ def gelu_fwd_fusion(
fd.add_output(T10)


def gelu_fwd_fn(inputs: list):  # [in_tensor, bias]
    return torch.nn.functional.gelu(inputs[0] + inputs[1], approximate="tanh")


@pytest.mark.parametrize("size", generate_input_sizes(dims=2))
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
def test_gelu_fwd_nvf_benchmark(
@@ -61,7 +58,7 @@ def test_gelu_fwd_nvf_benchmark(
with FusionDefinition() as fd:
gelu_fwd_fusion(fd, torch_dtype_to_nvfuser_dtype(dtype))
if not disable_validation:
eager_output = gelu_fwd_fn(inputs)
eager_output = gelu(inputs)
fd.validate(inputs, [eager_output])
if not disable_benchmarking:
run_benchmark(benchmark, fd.execute, inputs)
@@ -83,10 +80,7 @@ def test_gelu_fwd_baseline_benchmark(
torch.ones(size[-1], device="cuda", dtype=dtype), # bias
]

benchmark_fn = {
"eager": gelu_fwd_fn,
"torchcompile": torch.compile(gelu_fwd_fn),
}
benchmark_fn = with_executor(executor, gelu)

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(benchmark, benchmark_fn[executor], inputs)
run_benchmark(benchmark, benchmark_fn, inputs)
15 changes: 4 additions & 11 deletions benchmarks/python/test_groupnorm_fwd.py
@@ -4,10 +4,8 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
from .core import run_benchmark, clear_dynamo_cache, with_executor
import torch
import thunder
from thunder.executors.nvfuserex import nvfuserex
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES


@@ -145,15 +143,10 @@ def test_groupnorm_fwd_baseline_benchmark(
bias = torch.randn(C, device="cuda", dtype=dtype)
num_groups = get_n_groups(C)

benchmark_fn = {
"eager": groupnorm_fwd,
"torchcompile": torch.compile(groupnorm_fwd),
"thunder": thunder.jit(
groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
),
}
benchmark_fn = with_executor(executor, groupnorm_fwd)

run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
[x, weight, bias, num_groups],
)
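
Several of the files above now import their eager forward implementations from a new benchmarks/python/torch_ops module (from .torch_ops import dropout_layernorm, dropout_rmsnorm, gelu) instead of defining them inline. That module is not shown in this view; the sketch below reconstructs plausible contents from the inline definitions removed in this commit, using single-list signatures to match how the callers pass fwd_inputs.

```python
# Sketch of benchmarks/python/torch_ops.py, reconstructed from the inline
# definitions removed in this commit; the actual module may differ.
import torch


def dropout_layernorm(inputs: list):  # [input1, input2, weights, bias, dropout_p]
    input1, input2, weights, bias, dropout_p = inputs
    return torch.nn.functional.layer_norm(
        input2 + torch.nn.functional.dropout(input1, p=dropout_p),
        normalized_shape=input1.shape[1:],
        weight=weights,
        bias=bias,
    )


def dropout_rmsnorm(inputs: list):  # [input1, input2, weights, dropout_p]
    input1, input2, weights, dropout_p = inputs
    x = input2 + torch.nn.functional.dropout(input1, p=dropout_p)
    return weights * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)


def gelu(inputs: list):  # [in_tensor, bias]
    return torch.nn.functional.gelu(inputs[0] + inputs[1], approximate="tanh")
```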