diff --git a/benchmarks/python/conftest.py b/benchmarks/python/conftest.py
index 03adbe1e7dd..04ba8ac13f2 100644
--- a/benchmarks/python/conftest.py
+++ b/benchmarks/python/conftest.py
@@ -2,7 +2,7 @@
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 import pytest
-from .core import BENCHMARK_CONFIG
+from .core import BENCHMARK_CONFIG, DEFAULT_EXECUTORS
 from nvfuser.pytorch_utils import DEVICE_PROPERTIES


@@ -104,20 +104,18 @@ def pytest_collection_modifyitems(session, config, items):
     from nvfuser.pytorch_utils import retry_on_oom_or_skip_test

-    executors = ["eager", "torchcompile", "thunder"]
-
     def get_test_executor(item) -> str | None:
         if hasattr(item, "callspec") and "executor" in item.callspec.params:
             test_executor = item.callspec.params["executor"]
             assert (
-                test_executor in executors
+                test_executor in DEFAULT_EXECUTORS
             ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}."
             return test_executor
         return None

     executors_to_skip = []

-    for executor in executors:
+    for executor in DEFAULT_EXECUTORS:
         if not config.getoption(f"--benchmark-{executor}"):
             executors_to_skip.append(executor)
diff --git a/benchmarks/python/core.py b/benchmarks/python/core.py
index d7b90033f56..1ea84fdfc59 100644
--- a/benchmarks/python/core.py
+++ b/benchmarks/python/core.py
@@ -13,7 +13,6 @@
 import thunder
 from thunder.executors.nvfuserex import nvfuserex

-
 # These variables can be overwritten through CLI commands
 # --benchmark-rounds=rounds --benchmark-warmup-rounds=warmup_rounds
 # --benchmark-num-inputs=num_inputs
@@ -22,6 +21,9 @@
 L2_CACHE_SIZE = DEVICE_PROPERTIES["gpu_l2_bytes"]
 PEAK_BANDWIDTH_GBPS = DEVICE_PROPERTIES["gpu_peak_bandwidth_gbps"]

+# Default executors
+DEFAULT_EXECUTORS = ["eager", "torchcompile", "thunder"]
+

 def clear_l2_cache() -> None:
     """
@@ -48,7 +50,7 @@ def unary_bwd_torch(inputs: List):  # [output, grad_out]


 def with_executor(executor: str, fwd_fn: Callable) -> Callable:
-    assert executor in ["eager", "torchcompile", "thunder"]
+    assert executor in DEFAULT_EXECUTORS
     if executor == "eager":
         return fwd_fn
     if executor == "torchcompile":
@@ -152,7 +154,6 @@ def torchprofile_timer(self) -> float:
             # Clear the internal profiler object to avoid accumulating function events and then restart the profiler
             # See PR: https://github.com/pytorch/pytorch/pull/125510
             self.prof.profiler = None
-            self.prof.start()

         return self.current_time
@@ -325,6 +326,9 @@ def run_benchmark(
     def setup():
         clear_l2_cache()
         if device == "cuda":
+            for inp in inputs:
+                if isinstance(inp, torch.Tensor):
+                    inp.grad = None
             return [inputs], {}

         # Device = 'host'
diff --git a/benchmarks/python/normalization.py b/benchmarks/python/normalization.py
index a4f72242f4f..0da1e95ffe8 100644
--- a/benchmarks/python/normalization.py
+++ b/benchmarks/python/normalization.py
@@ -501,6 +501,6 @@ def norm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=norm_bwd_iobytes(size, dtype, norm),
     )
diff --git a/benchmarks/python/test_batchnorm_bwd.py b/benchmarks/python/test_batchnorm_bwd.py
index 0a1cd64cc57..25c49dd743e 100644
--- a/benchmarks/python/test_batchnorm_bwd.py
+++ b/benchmarks/python/test_batchnorm_bwd.py
@@ -5,6 +5,7 @@
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES
 from .normalization import norm_bwd_nvf_benchmark, norm_bwd_baseline_benchmark
+from .core import DEFAULT_EXECUTORS


 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
@@ -31,7 +32,7 @@ def test_batchnorm_bwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
diff --git a/benchmarks/python/test_batchnorm_fwd.py b/benchmarks/python/test_batchnorm_fwd.py
index af197ce6f1b..b146c0fce62 100644
--- a/benchmarks/python/test_batchnorm_fwd.py
+++ b/benchmarks/python/test_batchnorm_fwd.py
@@ -5,6 +5,7 @@
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES
 from .normalization import norm_fwd_nvf_benchmark, norm_fwd_baseline_benchmark
+from .core import DEFAULT_EXECUTORS


 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
@@ -31,7 +32,7 @@ def test_batchnorm_fwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
diff --git a/benchmarks/python/test_broadcast_add_fwd.py b/benchmarks/python/test_broadcast_add_fwd.py
index dcedb5a0ded..084590762ee 100644
--- a/benchmarks/python/test_broadcast_add_fwd.py
+++ b/benchmarks/python/test_broadcast_add_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -88,7 +88,7 @@ def test_bcast_add_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, x])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("bcast_axis", [0, 1], ids=["outer", "inner"])
diff --git a/benchmarks/python/test_dropout_layernorm_bwd.py b/benchmarks/python/test_dropout_layernorm_bwd.py
index 71b54f3f86b..ff5de768ba5 100644
--- a/benchmarks/python/test_dropout_layernorm_bwd.py
+++ b/benchmarks/python/test_dropout_layernorm_bwd.py
@@ -10,6 +10,7 @@
     unary_bwd_torch,
     compute_total_iobytes,
     with_executor,
+    DEFAULT_EXECUTORS,
 )
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -191,7 +192,7 @@ def test_dropout_layernorm_bwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_bwd_baseline_benchmark(
@@ -219,6 +220,6 @@ def test_dropout_layernorm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=dropout_layernorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_dropout_layernorm_fwd.py b/benchmarks/python/test_dropout_layernorm_fwd.py
index 78eec0cedf2..c9930ae1b24 100644
--- a/benchmarks/python/test_dropout_layernorm_fwd.py
+++ b/benchmarks/python/test_dropout_layernorm_fwd.py
@@ -9,6 +9,7 @@
     clear_dynamo_cache,
     compute_total_iobytes,
     with_executor,
+    DEFAULT_EXECUTORS,
 )
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -151,7 +152,7 @@ def test_dropout_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_dropout_rmsnorm_bwd.py b/benchmarks/python/test_dropout_rmsnorm_bwd.py
index 275a56e0731..5182a9f4e6a 100644
--- a/benchmarks/python/test_dropout_rmsnorm_bwd.py
+++ b/benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -10,6 +10,7 @@
     unary_bwd_torch,
     compute_total_iobytes,
     with_executor,
+    DEFAULT_EXECUTORS,
 )
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -171,7 +172,7 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_bwd_baseline_benchmark(
@@ -195,6 +196,6 @@ def test_dropout_rmsnorm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=dropout_rmsnorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_dropout_rmsnorm_fwd.py b/benchmarks/python/test_dropout_rmsnorm_fwd.py
index 16b5b72f7a1..dfde53d66e1 100644
--- a/benchmarks/python/test_dropout_rmsnorm_fwd.py
+++ b/benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -9,6 +9,7 @@
     clear_dynamo_cache,
     compute_total_iobytes,
     with_executor,
+    DEFAULT_EXECUTORS,
 )
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -141,7 +142,7 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2, weights])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_gelu_bwd.py b/benchmarks/python/test_gelu_bwd.py
index 27a6107cb3d..04b6f705255 100644
--- a/benchmarks/python/test_gelu_bwd.py
+++ b/benchmarks/python/test_gelu_bwd.py
@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -89,7 +95,7 @@ def test_gelu_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_bwd_baseline_benchmark(
@@ -111,6 +117,6 @@ def test_gelu_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=gelu_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_gelu_fwd.py b/benchmarks/python/test_gelu_fwd.py
index 3f56b857fa2..b5eb153d395 100644
--- a/benchmarks/python/test_gelu_fwd.py
+++ b/benchmarks/python/test_gelu_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 from .torch_ops import gelu
@@ -64,7 +64,7 @@ def test_gelu_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_groupnorm_fwd.py b/benchmarks/python/test_groupnorm_fwd.py
index 5e47179ecf3..7139e4f8d7b 100644
--- a/benchmarks/python/test_groupnorm_fwd.py
+++ b/benchmarks/python/test_groupnorm_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -126,7 +126,7 @@ def test_groupnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, weight, bias])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_groupnorm_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_huggingface_attn_bwd.py b/benchmarks/python/test_huggingface_attn_bwd.py
index ddfd33565d9..258d98d1a37 100644
--- a/benchmarks/python/test_huggingface_attn_bwd.py
+++ b/benchmarks/python/test_huggingface_attn_bwd.py
@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES
 from .torch_ops import huggingface_attn
@@ -108,7 +114,7 @@ def test_huggingface_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_bwd_baseline_benchmark(
@@ -138,6 +144,6 @@ def test_huggingface_attn_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=huggingface_attn_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_huggingface_attn_fwd.py b/benchmarks/python/test_huggingface_attn_fwd.py
index 624deb2354a..c6271c3bbb1 100644
--- a/benchmarks/python/test_huggingface_attn_fwd.py
+++ b/benchmarks/python/test_huggingface_attn_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES
 from .torch_ops import huggingface_attn
@@ -125,7 +125,7 @@ def test_huggingface_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [attention_mask, inputs])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_layernorm_bwd.py b/benchmarks/python/test_layernorm_bwd.py
index 27e4c5a4b9f..08be8e6a7b6 100644
--- a/benchmarks/python/test_layernorm_bwd.py
+++ b/benchmarks/python/test_layernorm_bwd.py
@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -147,7 +153,7 @@ def test_layernorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, mean, invstd, weights])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_bwd_baseline_benchmark(
@@ -172,6 +178,6 @@ def test_layernorm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=layernorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_layernorm_fwd.py b/benchmarks/python/test_layernorm_fwd.py
index 1655e639f28..e833c4360ef 100644
--- a/benchmarks/python/test_layernorm_fwd.py
+++ b/benchmarks/python/test_layernorm_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -98,7 +98,7 @@ def test_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_nanogpt_attn_bwd.py b/benchmarks/python/test_nanogpt_attn_bwd.py
index 8d1feeb446e..54d19fd7093 100644
--- a/benchmarks/python/test_nanogpt_attn_bwd.py
+++ b/benchmarks/python/test_nanogpt_attn_bwd.py
@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES
 from .torch_ops import nanogpt_attn
@@ -125,7 +131,7 @@ def test_nanogpt_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask, bias_mask])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_bwd_baseline_benchmark(
@@ -156,6 +162,6 @@ def test_nanogpt_attn_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=nanogpt_attn_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_nanogpt_attn_fwd.py b/benchmarks/python/test_nanogpt_attn_fwd.py
index ae55b7b3f7f..70947337e4f 100644
--- a/benchmarks/python/test_nanogpt_attn_fwd.py
+++ b/benchmarks/python/test_nanogpt_attn_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES
 from .torch_ops import nanogpt_attn
@@ -127,7 +127,7 @@ def test_nanogpt_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, bias])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_pointwise_mul.py b/benchmarks/python/test_pointwise_mul.py
index 3f552ff0026..6e73dbf876d 100644
--- a/benchmarks/python/test_pointwise_mul.py
+++ b/benchmarks/python/test_pointwise_mul.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -50,7 +50,7 @@ def test_pointwise_mul_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_pointwise_mul_baseline_benchmark(
diff --git a/benchmarks/python/test_reduction.py b/benchmarks/python/test_reduction.py
index d24848c8f12..6c93cd75088 100644
--- a/benchmarks/python/test_reduction.py
+++ b/benchmarks/python/test_reduction.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -53,7 +53,7 @@ def test_reduction_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
diff --git a/benchmarks/python/test_reduction_epilogue.py b/benchmarks/python/test_reduction_epilogue.py
index ce5e1961d63..7e855c0fd18 100644
--- a/benchmarks/python/test_reduction_epilogue.py
+++ b/benchmarks/python/test_reduction_epilogue.py
@@ -5,7 +5,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -67,7 +67,7 @@ def test_reduction_epilogue_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, epilogue])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0])
diff --git a/benchmarks/python/test_rmsnorm_bwd.py b/benchmarks/python/test_rmsnorm_bwd.py
index 14832521a24..3f7a15fa1a1 100644
--- a/benchmarks/python/test_rmsnorm_bwd.py
+++ b/benchmarks/python/test_rmsnorm_bwd.py
@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -113,7 +119,7 @@ def test_rmsnorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, rms_eps, grads, weights])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_bwd_baseline_benchmark(
@@ -137,6 +143,6 @@ def test_rmsnorm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=rmsnorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_rmsnorm_fwd.py b/benchmarks/python/test_rmsnorm_fwd.py
index bb19ea151b7..fea92532034 100644
--- a/benchmarks/python/test_rmsnorm_fwd.py
+++ b/benchmarks/python/test_rmsnorm_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -81,7 +81,7 @@ def test_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, weights])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_scale_bias_relu_bwd.py b/benchmarks/python/test_scale_bias_relu_bwd.py
index 7421a89dfaa..219247fc8e7 100644
--- a/benchmarks/python/test_scale_bias_relu_bwd.py
+++ b/benchmarks/python/test_scale_bias_relu_bwd.py
@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -80,7 +86,7 @@ def test_sbr_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [scale, bool_mask, grads])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_bwd_baseline_benchmark(
@@ -104,6 +110,6 @@ def test_sbr_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=sbr_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_scale_bias_relu_fwd.py b/benchmarks/python/test_scale_bias_relu_fwd.py
index a7231667a01..06125881595 100644
--- a/benchmarks/python/test_scale_bias_relu_fwd.py
+++ b/benchmarks/python/test_scale_bias_relu_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -79,7 +79,7 @@ def test_sbr_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, scale, inputs])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_silu_mul_bwd.py b/benchmarks/python/test_silu_mul_bwd.py
index fe24989e29e..b5aaa805161 100644
--- a/benchmarks/python/test_silu_mul_bwd.py
+++ b/benchmarks/python/test_silu_mul_bwd.py
@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -80,7 +86,7 @@ def test_silu_mul_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, x, y])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_bwd_baseline_benchmark(
@@ -103,6 +109,6 @@ def test_silu_mul_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=silu_mul_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_silu_mul_fwd.py b/benchmarks/python/test_silu_mul_fwd.py
index 31eafe957f9..571d7e59315 100644
--- a/benchmarks/python/test_silu_mul_fwd.py
+++ b/benchmarks/python/test_silu_mul_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 from .torch_ops import silu_mul
@@ -53,7 +53,7 @@ def test_silu_mul_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_softmax_bwd.py b/benchmarks/python/test_softmax_bwd.py
index e0d24b4176b..685f9ad4b3e 100644
--- a/benchmarks/python/test_softmax_bwd.py
+++ b/benchmarks/python/test_softmax_bwd.py
@@ -4,7 +4,13 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
+from .core import (
+    run_benchmark,
+    clear_dynamo_cache,
+    unary_bwd_torch,
+    with_executor,
+    DEFAULT_EXECUTORS,
+)
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES
 import numpy as np
@@ -92,7 +98,7 @@ def test_softmax_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -115,6 +121,6 @@ def test_softmax_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=softmax_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_softmax_fwd.py b/benchmarks/python/test_softmax_fwd.py
index f1ff881b1f6..dd2216cf34a 100644
--- a/benchmarks/python/test_softmax_fwd.py
+++ b/benchmarks/python/test_softmax_fwd.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
 import numpy as np
@@ -78,7 +78,7 @@ def test_softmax_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index a192e0b72a3..07d9264484d 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -4,7 +4,7 @@
 import pytest
 from nvfuser import FusionDefinition, DataType
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
-from .core import run_benchmark, clear_dynamo_cache, with_executor
+from .core import run_benchmark, clear_dynamo_cache, with_executor, DEFAULT_EXECUTORS
 import torch
 from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -74,7 +74,7 @@ def test_transpose_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2])


-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=3))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)])
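
Note on the behavioral change in core.py: run_benchmark's setup() now resets .grad on every torch.Tensor input, and the backward baselines pass *fwd_inputs through so those leaf tensors are visible to setup(). The standalone sketch below (illustrative only, not part of the patch; shapes and values are arbitrary) shows why this matters: repeated backward() calls accumulate into leaf .grad buffers, so without the reset, later benchmark rounds would measure gradient accumulation (a read-modify-write) rather than the fresh gradient write of the first round.

import torch

x = torch.randn(1024, requires_grad=True)
out = x * 3.0
grad_out = torch.ones_like(out)

# Round 1: x.grad is written fresh.
out.backward(grad_out, retain_graph=True)
first = x.grad.clone()

# Round 2 without a reset: gradients accumulate, so x.grad doubles
# and the measured work no longer matches round 1.
out.backward(grad_out, retain_graph=True)
assert torch.allclose(x.grad, 2 * first)

# Resetting between rounds, as the patched setup() does for every
# torch.Tensor input, keeps all measured rounds identical.
x.grad = None
out.backward(grad_out, retain_graph=True)
assert torch.allclose(x.grad, first)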