Add thunder benchmarks #3394

Status: Draft. Wants to merge 6 commits into base: main.
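Summary, as inferred from the diff below: this draft promotes "thunder" to a default benchmark executor. It adds a shared DEFAULT_EXECUTORS list ("eager", "torchcompile", "thunder") to benchmarks/python/core.py, replaces the hard-coded executor lists in conftest.py and the baseline benchmark tests with that constant, clears input gradients in run_benchmark's setup() between rounds, and threads the forward inputs through to the backward baseline benchmarks so those gradients can be reset.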
benchmarks/python/conftest.py (8 changes: 3 additions & 5 deletions)

@@ -2,7 +2,7 @@
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 import pytest
-from .core import BENCHMARK_CONFIG
+from .core import BENCHMARK_CONFIG, DEFAULT_EXECUTORS
 from nvfuser.pytorch_utils import DEVICE_PROPERTIES


@@ -104,20 +104,18 @@ def pytest_collection_modifyitems(session, config, items):

     from nvfuser.pytorch_utils import retry_on_oom_or_skip_test

-    executors = ["eager", "torchcompile", "thunder"]
-
     def get_test_executor(item) -> str | None:
         if hasattr(item, "callspec") and "executor" in item.callspec.params:
             test_executor = item.callspec.params["executor"]
             assert (
-                test_executor in executors
+                test_executor in DEFAULT_EXECUTORS
             ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}."
             return test_executor
         return None

     executors_to_skip = []

-    for executor in executors:
+    for executor in DEFAULT_EXECUTORS:
         if not config.getoption(f"--benchmark-{executor}"):
             executors_to_skip.append(executor)
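For orientation, the skip logic above keys off per-executor CLI flags via config.getoption(f"--benchmark-{executor}"). A minimal sketch of how such flags would be registered, assuming a pytest_addoption hook in this conftest (the hook itself is outside the visible diff):

    # Hypothetical registration of the per-executor flags; not part of the diff.
    def pytest_addoption(parser):
        for executor in DEFAULT_EXECUTORS:
            parser.addoption(
                f"--benchmark-{executor}",
                action="store_true",
                default=False,
                help=f"Run benchmarks with the '{executor}' executor.",
            )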
benchmarks/python/core.py (9 changes: 7 additions & 2 deletions)

@@ -13,7 +13,6 @@
 import thunder
 from thunder.executors.nvfuserex import nvfuserex

-
 # These variables can be overwritten through CLI commands
 # --benchmark-rounds=rounds --benchmark-warmup-rounds=warmup_rounds
 # --benchmark-num-inputs=num_inputs
@@ -22,6 +21,9 @@
 L2_CACHE_SIZE = DEVICE_PROPERTIES["gpu_l2_bytes"]
 PEAK_BANDWIDTH_GBPS = DEVICE_PROPERTIES["gpu_peak_bandwidth_gbps"]

+# Default executors
+DEFAULT_EXECUTORS = ["eager", "torchcompile", "thunder"]
+

 def clear_l2_cache() -> None:
     """
@@ -48,7 +50,7 @@ def unary_bwd_torch(inputs: List): # [output, grad_out]


 def with_executor(executor: str, fwd_fn: Callable) -> Callable:
-    assert executor in ["eager", "torchcompile", "thunder"]
+    assert executor in DEFAULT_EXECUTORS
     if executor == "eager":
         return fwd_fn
     if executor == "torchcompile":
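The remaining branches of with_executor are collapsed in this view. A plausible sketch of the full helper, assuming the thunder path uses thunder.jit with the nvfuserex executor imported at the top of core.py (an assumption, not confirmed by the diff):

    # Hypothetical sketch of the complete helper; the thunder branch and its
    # arguments are assumptions, not taken from the PR.
    def with_executor(executor: str, fwd_fn: Callable) -> Callable:
        assert executor in DEFAULT_EXECUTORS
        if executor == "eager":
            return fwd_fn
        if executor == "torchcompile":
            return torch.compile(fwd_fn)
        # executor == "thunder"
        return thunder.jit(fwd_fn, executors=[nvfuserex])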
@@ -325,6 +327,9 @@ def run_benchmark(
     def setup():
         clear_l2_cache()
         if device == "cuda":
+            for inp in inputs:
+                if isinstance(inp, torch.Tensor):
+                    inp.grad = None
             return [inputs], {}

     # Device = 'host'
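The new setup() loop zeroes each tensor input's .grad before every round. This matters for the backward benchmarks: PyTorch accumulates gradients across backward() calls, so without the reset each round would add an accumulation step and extra memory traffic. A standalone illustration (not part of the PR):

    import torch

    x = torch.ones(3, requires_grad=True)
    y = (x * x).sum()
    y.backward(retain_graph=True)
    y.backward(retain_graph=True)  # grads accumulate: x.grad is now 4.0 per element
    x.grad = None                  # what setup() now does between rounds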
benchmarks/python/normalization.py (2 changes: 1 addition & 1 deletion)

@@ -501,6 +501,6 @@ def norm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=norm_bwd_iobytes(size, dtype, norm),
     )
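unary_bwd_torch reads only the first two entries of its input list ([output, grad_out]), so the appended *fwd_inputs are not consumed by the benchmarked function itself. Appending them appears intended to place the forward inputs into the list that run_benchmark's setup() iterates over, so their .grad fields are cleared between rounds; the same change recurs in every backward baseline below. A sketch of the call shape, with the body assumed from the signature comment in core.py:

    from typing import List
    import torch

    def unary_bwd_torch(inputs: List):  # [output, grad_out]
        # Assumed body: replay the backward pass; entries beyond the
        # first two are ignored by the function itself.
        inputs[0].backward(inputs[1], retain_graph=True)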
benchmarks/python/test_batchnorm_bwd.py (3 changes: 2 additions & 1 deletion)

@@ -5,6 +5,7 @@
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES
 from .normalization import norm_bwd_nvf_benchmark, norm_bwd_baseline_benchmark
+from .core import DEFAULT_EXECUTORS


 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
@@ -31,7 +32,7 @@ def test_batchnorm_bwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
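Widening the executor axis to DEFAULT_EXECUTORS makes each baseline test generate a thunder variant as well. Given the conftest logic above, a variant only runs when its flag is passed, e.g. pytest benchmarks/python/test_batchnorm_bwd.py --benchmark-thunder (the flag name follows the config.getoption(f"--benchmark-{executor}") pattern; the exact invocation is an assumption). The same one-line parametrize change repeats in each test file below.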
benchmarks/python/test_batchnorm_fwd.py (3 changes: 2 additions & 1 deletion)

@@ -5,6 +5,7 @@
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES
 from .normalization import norm_fwd_nvf_benchmark, norm_fwd_baseline_benchmark
+from .core import DEFAULT_EXECUTORS


 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
@@ -31,7 +32,7 @@ def test_batchnorm_fwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
benchmarks/python/test_broadcast_add_fwd.py (4 changes: 2 additions & 2 deletions)

@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -88,7 +88,7 @@ def test_bcast_add_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, x])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("bcast_axis", [0, 1], ids=["outer", "inner"])
benchmarks/python/test_dropout_layernorm_bwd.py (5 changes: 3 additions & 2 deletions)

@@ -10,6 +10,7 @@
     unary_bwd_torch,
     compute_total_iobytes,
     with_executor,
+    DEFAULT_EXECUTORS,
 )
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -191,7 +192,7 @@ def test_dropout_layernorm_bwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_bwd_baseline_benchmark(
@@ -219,6 +220,6 @@ def test_dropout_layernorm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=dropout_layernorm_bwd_iobytes(size, dtype),
     )
benchmarks/python/test_dropout_layernorm_fwd.py (3 changes: 2 additions & 1 deletion)

@@ -9,6 +9,7 @@
     clear_dynamo_cache,
     compute_total_iobytes,
     with_executor,
+    DEFAULT_EXECUTORS,
 )
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -151,7 +152,7 @@ def test_dropout_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_fwd_baseline_benchmark(
benchmarks/python/test_dropout_rmsnorm_bwd.py (5 changes: 3 additions & 2 deletions)

@@ -10,6 +10,7 @@
     unary_bwd_torch,
     compute_total_iobytes,
     with_executor,
+    DEFAULT_EXECUTORS,
 )
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -171,7 +172,7 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_bwd_baseline_benchmark(
@@ -195,6 +196,6 @@ def test_dropout_rmsnorm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=dropout_rmsnorm_bwd_iobytes(size, dtype),
     )
benchmarks/python/test_dropout_rmsnorm_fwd.py (3 changes: 2 additions & 1 deletion)

@@ -9,6 +9,7 @@
     clear_dynamo_cache,
     compute_total_iobytes,
     with_executor,
+    DEFAULT_EXECUTORS,
 )
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -141,7 +142,7 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2, weights])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_fwd_baseline_benchmark(
benchmarks/python/test_gelu_bwd.py (12 changes: 9 additions & 3 deletions)

@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -89,7 +95,7 @@ def test_gelu_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_bwd_baseline_benchmark(
@@ -111,6 +117,6 @@ def test_gelu_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=gelu_bwd_iobytes(size, dtype),
     )
benchmarks/python/test_gelu_fwd.py (4 changes: 2 additions & 2 deletions)

@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 from .torch_ops import gelu
@@ -64,7 +64,7 @@ def test_gelu_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_fwd_baseline_benchmark(
benchmarks/python/test_groupnorm_fwd.py (4 changes: 2 additions & 2 deletions)

@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -126,7 +126,7 @@ def test_groupnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, weight, bias])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_groupnorm_fwd_baseline_benchmark(
benchmarks/python/test_huggingface_attn_bwd.py (12 changes: 9 additions & 3 deletions)

@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES
 from .torch_ops import huggingface_attn
@@ -108,7 +114,7 @@ def test_huggingface_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_bwd_baseline_benchmark(
@@ -138,6 +144,6 @@ def test_huggingface_attn_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=huggingface_attn_bwd_iobytes(size, dtype),
     )
benchmarks/python/test_huggingface_attn_fwd.py (4 changes: 2 additions & 2 deletions)

@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES
 from .torch_ops import huggingface_attn
@@ -125,7 +125,7 @@ def test_huggingface_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [attention_mask, inputs])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_fwd_baseline_benchmark(
benchmarks/python/test_layernorm_bwd.py (12 changes: 9 additions & 3 deletions)

@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -147,7 +153,7 @@ def test_layernorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, mean, invstd, weights])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_bwd_baseline_benchmark(
@@ -172,6 +178,6 @@ def test_layernorm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=layernorm_bwd_iobytes(size, dtype),
     )
benchmarks/python/test_layernorm_fwd.py (4 changes: 2 additions & 2 deletions)

@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -98,7 +98,7 @@ def test_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_fwd_baseline_benchmark(