Commit: add import
Priya2698 committed Dec 9, 2024
1 parent 4da1966 commit 15a9f50
Showing 33 changed files with 62 additions and 59 deletions.
8 changes: 3 additions & 5 deletions benchmarks/python/conftest.py
@@ -4,7 +4,7 @@
 import pytest
 from .core import BENCHMARK_CONFIG
 from nvfuser.pytorch_utils import DEVICE_PROPERTIES
-
+from .global_params import DEFAULT_EXECUTORS

 def pytest_addoption(parser):
     parser.addoption(
@@ -104,20 +104,18 @@ def pytest_collection_modifyitems(session, config, items):

     from nvfuser.pytorch_utils import retry_on_oom_or_skip_test

-    executors = ["eager", "torchcompile", "thunder"]
-
     def get_test_executor(item) -> str | None:
         if hasattr(item, "callspec") and "executor" in item.callspec.params:
             test_executor = item.callspec.params["executor"]
             assert (
-                test_executor in executors
+                test_executor in DEFAULT_EXECUTORS
             ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}."
             return test_executor
         return None

     executors_to_skip = []

-    for executor in executors:
+    for executor in DEFAULT_EXECUTORS:
         if not config.getoption(f"--benchmark-{executor}"):
             executors_to_skip.append(executor)
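Aside: the `--benchmark-{executor}` flags consulted above are registered in `pytest_addoption`. A minimal sketch of how that registration can stay in sync with `DEFAULT_EXECUTORS`; only the flag-name pattern appears in this diff, so the `action`, `default`, and `help` arguments are assumptions:

```python
from .global_params import DEFAULT_EXECUTORS

def pytest_addoption(parser):
    # One flag per executor, matching config.getoption(f"--benchmark-{executor}")
    for executor in DEFAULT_EXECUTORS:
        parser.addoption(
            f"--benchmark-{executor}",
            action="store_true",   # assumed: boolean opt-in flag
            default=False,
            help=f"Run {executor} baseline benchmarks.",  # assumed help text
        )
```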
7 changes: 5 additions & 2 deletions benchmarks/python/core.py
@@ -12,7 +12,7 @@
 import warnings
 import thunder
 from thunder.executors.nvfuserex import nvfuserex
-
+from .global_params import DEFAULT_EXECUTORS

 # These variables can be overwritten through CLI commands
 # --benchmark-rounds=rounds --benchmark-warmup-rounds=warmup_rounds
@@ -47,7 +47,7 @@ def unary_bwd_torch(inputs: List):  # [output, grad_out]
     inputs[0].backward(inputs[1], retain_graph=True)

 def with_executor(executor: str, fwd_fn: Callable) -> Callable:
-    assert executor in ["eager", "torchcompile", "thunder"]
+    assert executor in DEFAULT_EXECUTORS
     if executor == 'eager':
         return fwd_fn
     if executor == 'torchcompile':
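The hunk cuts off before the remaining branches of `with_executor`. A hedged sketch of the full dispatch, assuming `torch.compile` for the torchcompile path and `thunder.jit` with the `nvfuserex` executor (suggested by the import at the top of core.py, but not shown in this diff):

```python
import torch
import thunder
from thunder.executors.nvfuserex import nvfuserex

def with_executor(executor: str, fwd_fn):
    assert executor in DEFAULT_EXECUTORS
    if executor == "eager":
        return fwd_fn                 # run the function as-is
    if executor == "torchcompile":
        return torch.compile(fwd_fn)  # assumed torchcompile path
    # assumed thunder path, routing supported fusions through nvFuser
    return thunder.jit(fwd_fn, executors=[nvfuserex])
```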
@@ -335,6 +335,9 @@ def run_benchmark(
     def setup():
         clear_l2_cache()
         if device == "cuda":
+            for inp in inputs:
+                if isinstance(inp, torch.Tensor):
+                    inp.grad = None
             return [inputs], {}

     # Device = 'host'
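The new `setup()` lines matter because `Tensor.backward` accumulates into `.grad` rather than overwriting it; without the reset, every benchmark round after the first would also pay for a gradient-accumulation add. A small standalone illustration:

```python
import torch

x = torch.ones(3, requires_grad=True)
loss = (x * 2).sum()
loss.backward(retain_graph=True)
print(x.grad)    # tensor([2., 2., 2.])
loss.backward()  # gradients accumulate across backward calls
print(x.grad)    # tensor([4., 4., 4.])
x.grad = None    # what setup() now does for each input between rounds
```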
2 changes: 2 additions & 0 deletions benchmarks/python/global_params.py
@@ -26,6 +26,8 @@
 # Datatypes that will be promoted to Datatype.Float in Fusion Definitions
 PROMOTE_DTYPES = [DataType.BFloat16, DataType.Half]

+# Default executors
+DEFAULT_EXECUTORS = ["eager", "torchcompile", "thunder"]
 # Model Parameters from LLMs (GPT2/3, PaLM, LLama)

 # Embedding size: d_model, d_ff = 4 * d_model
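With the list defined once here, benchmark modules can parametrize over it instead of repeating string literals, which is exactly what the test-file changes below do. A hypothetical test module for illustration (`test_my_op_baseline_benchmark` is not part of this commit):

```python
import pytest
from .global_params import generate_input_sizes, FLOAT_DTYPES, DEFAULT_EXECUTORS

@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
@pytest.mark.parametrize("size", generate_input_sizes(dims=2))
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
def test_my_op_baseline_benchmark(benchmark, executor, size, dtype):
    ...  # hypothetical body: build inputs, wrap with with_executor, run_benchmark
```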
2 changes: 1 addition & 1 deletion benchmarks/python/normalization.py
@@ -501,6 +501,6 @@ def norm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=norm_bwd_iobytes(size, dtype, norm),
     )
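Why append `*fwd_inputs` here: `unary_bwd_torch` (shown in core.py above) only reads the first two entries, so the extra tensors exist purely so that `run_benchmark`'s `setup()` can find them and clear their `.grad` between rounds. A sketch of the resulting contract:

```python
# inputs = [output, grad_out, *fwd_inputs]
# Only the first two entries drive the backward pass; the trailing
# fwd_inputs ride along so setup() can reset their .grad each round.
def unary_bwd_torch(inputs):
    inputs[0].backward(inputs[1], retain_graph=True)
```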
4 changes: 2 additions & 2 deletions benchmarks/python/test_batchnorm_bwd.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 import pytest
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, DEFAULT_EXECUTORS
 from .normalization import norm_bwd_nvf_benchmark, norm_bwd_baseline_benchmark

@@ -31,7 +31,7 @@ def test_batchnorm_bwd_nvf_benchmark(
     )

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
4 changes: 2 additions & 2 deletions benchmarks/python/test_batchnorm_fwd.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 import pytest
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, DEFAULT_EXECUTORS
 from .normalization import norm_fwd_nvf_benchmark, norm_fwd_baseline_benchmark

@@ -31,7 +31,7 @@ def test_batchnorm_fwd_nvf_benchmark(
     )

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
4 changes: 2 additions & 2 deletions benchmarks/python/test_broadcast_add_fwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS

 def bcast_add_fusion(
@@ -88,7 +88,7 @@ def test_bcast_add_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, x])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("bcast_axis", [0, 1], ids=["outer", "inner"])
4 changes: 2 additions & 2 deletions benchmarks/python/test_dropout_layernorm_bwd.py
@@ -12,7 +12,7 @@
     with_executor,
 )
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import dropout_layernorm

@@ -191,7 +191,7 @@ def test_dropout_layernorm_bwd_nvf_benchmark(
     )

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_bwd_baseline_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_dropout_layernorm_fwd.py
@@ -11,7 +11,7 @@
     with_executor,
 )
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import dropout_layernorm

@@ -151,7 +151,7 @@ def test_dropout_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_fwd_baseline_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -12,7 +12,7 @@
     with_executor,
 )
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import dropout_rmsnorm

@@ -171,7 +171,7 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark(
     )

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_bwd_baseline_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -11,7 +11,7 @@
     with_executor,
 )
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import dropout_rmsnorm

@@ -141,7 +141,7 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2, weights])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_fwd_baseline_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_gelu_bwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import gelu

@@ -89,7 +89,7 @@ def test_gelu_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_bwd_baseline_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_gelu_bwd_reduction.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
4 changes: 2 additions & 2 deletions benchmarks/python/test_gelu_fwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import gelu

@@ -64,7 +64,7 @@ def test_gelu_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_fwd_baseline_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_groupnorm_fwd.py
@@ -126,7 +126,7 @@ def test_groupnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, weight, bias])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_groupnorm_fwd_baseline_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_huggingface_attn_bwd.py
@@ -108,7 +108,7 @@ def test_huggingface_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_bwd_baseline_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_huggingface_attn_fwd.py
@@ -125,7 +125,7 @@ def test_huggingface_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [attention_mask, inputs])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_fwd_baseline_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_layernorm_bwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import layernorm

@@ -147,7 +147,7 @@ def test_layernorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, mean, invstd, weights])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_bwd_baseline_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_layernorm_fwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import layernorm

@@ -98,7 +98,7 @@ def test_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_fwd_baseline_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_nanogpt_attn_bwd.py
@@ -125,7 +125,7 @@ def test_nanogpt_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask, bias_mask])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_bwd_baseline_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_nanogpt_attn_fwd.py
@@ -127,7 +127,7 @@ def test_nanogpt_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, bias])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_fwd_baseline_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_pointwise_mul.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS

 def pointwise_mul_fusion(
@@ -50,7 +50,7 @@ def test_pointwise_mul_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_pointwise_mul_baseline_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_reduction.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS

 def reduction_fusion(
@@ -53,7 +53,7 @@ def test_reduction_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
4 changes: 2 additions & 2 deletions benchmarks/python/test_reduction_epilogue.py
@@ -7,7 +7,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS

 # test the influence of epilogue on the performance of reduction.
 # current reduction scheduler only allows epilogue to be fused with outer reduction without post reduction broadcast.
@@ -67,7 +67,7 @@ def test_reduction_epilogue_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, epilogue])

-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0])