diff --git a/benchmarks/python/conftest.py b/benchmarks/python/conftest.py
index 8932afbff30..03adbe1e7dd 100644
--- a/benchmarks/python/conftest.py
+++ b/benchmarks/python/conftest.py
@@ -96,45 +96,39 @@ def pytest_configure(config):
 def pytest_collection_modifyitems(session, config, items):
     """
-    The baseline benchmarks use `compile` parameter:
-        compile = false: Eager mode benchmark
-        compile = true: torch.compile benchmark
+    The baseline benchmarks use `executor` parameter with
+    values ["eager", "torchcompile", "thunder"] that are optionally
+    run using `--benchmark-{executor}` flag. They are skipped by
+    default.
     """
-    run_eager = config.getoption("--benchmark-eager")
-    run_thunder = config.getoption("--benchmark-thunder")
-    run_torchcompile = config.getoption("--benchmark-torchcompile")
     from nvfuser.pytorch_utils import retry_on_oom_or_skip_test
 
+    executors = ["eager", "torchcompile", "thunder"]
+
+    def get_test_executor(item) -> str | None:
+        if hasattr(item, "callspec") and "executor" in item.callspec.params:
+            test_executor = item.callspec.params["executor"]
+            assert (
+                test_executor in executors
+            ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}."
+            return test_executor
+        return None
+
+    executors_to_skip = []
+
+    for executor in executors:
+        if not config.getoption(f"--benchmark-{executor}"):
+            executors_to_skip.append(executor)
+
     for item in items:
         item.obj = retry_on_oom_or_skip_test(item.obj)
 
-    if not run_eager:
-        skip_eager = pytest.mark.skip(reason="need --benchmark-eager option to run")
-        for item in items:
-            # If the benchmark has compile=False parameter (eager mode), skip it.
-            if (
-                hasattr(item, "callspec")
-                and "compile" in item.callspec.params
-                and not item.callspec.params["compile"]
-            ):
-                item.add_marker(skip_eager)
-
-    if not run_torchcompile:
-        skip_torchcompile = pytest.mark.skip(
-            reason="need --benchmark-torchcompile option to run"
-        )
-        for item in items:
-            # If the benchmark has compile=True parameter (torch.compile mode), skip it.
-            if (
-                hasattr(item, "callspec")
-                and "compile" in item.callspec.params
-                and item.callspec.params["compile"]
-            ):
-                item.add_marker(skip_torchcompile)
-
-    if not run_thunder:
-        skip_thunder = pytest.mark.skip(reason="need --benchmark-thunder option to run")
-        for item in items:
-            if "thunder" in item.nodeid:
-                item.add_marker(skip_thunder)
+        test_executor = get_test_executor(item)
+
+        if test_executor is not None and test_executor in executors_to_skip:
+            item.add_marker(
+                pytest.mark.skip(
+                    reason=f"need --benchmark-{test_executor} option to run."
+                )
+            )
diff --git a/benchmarks/python/normalization.py b/benchmarks/python/normalization.py
index 8cbafe81353..6d493338846 100644
--- a/benchmarks/python/normalization.py
+++ b/benchmarks/python/normalization.py
@@ -433,10 +433,10 @@ def norm_fwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     channels_last: bool,
-    compile: bool,
+    executor: str,
     norm: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     assert norm in ["batch_norm", "instance_norm"], NotImplementedError
@@ -453,10 +453,12 @@ def norm_fwd_baseline_benchmark(
 
     norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn
 
+    benchmark_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(norm_fwd_fn) if compile else norm_fwd_fn,
+        benchmark_fn[executor],
         [inputs, weight, bias, running_mean, running_var],
         iobytes=norm_fwd_iobytes(size, dtype, norm),
     )
@@ -467,10 +469,10 @@ def norm_bwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     channels_last: bool,
-    compile: bool,
+    executor: str,
     norm: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     assert norm in ["batch_norm", "instance_norm"], NotImplementedError
@@ -491,13 +493,13 @@ def norm_bwd_baseline_benchmark(
     norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn
 
     # Compile the fwd fn for torchcompile
-    norm_fwd_fn = torch.compile(norm_fwd_fn) if compile else norm_fwd_fn
-    output = norm_fwd_fn([inputs, weight, bias, running_mean, running_var])
+    fwd_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
+    outputs = fwd_fn[executor]([inputs, weight, bias, running_mean, running_var])
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=norm_bwd_iobytes(size, dtype, norm),
     )
diff --git a/benchmarks/python/test_batchnorm_bwd.py b/benchmarks/python/test_batchnorm_bwd.py
index 74242ba99e2..0a1cd64cc57 100644
--- a/benchmarks/python/test_batchnorm_bwd.py
+++ b/benchmarks/python/test_batchnorm_bwd.py
@@ -31,13 +31,13 @@ def test_batchnorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_batchnorm_bwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_bwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "batch_norm"
+        benchmark, size, dtype, channels_last, executor, "batch_norm"
     )
diff --git a/benchmarks/python/test_batchnorm_fwd.py b/benchmarks/python/test_batchnorm_fwd.py
index 47b3997770a..af197ce6f1b 100644
--- a/benchmarks/python/test_batchnorm_fwd.py
+++ b/benchmarks/python/test_batchnorm_fwd.py
@@ -31,13 +31,13 @@ def test_batchnorm_fwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_batchnorm_fwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_fwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "batch_norm"
+        benchmark, size, dtype, channels_last, executor, "batch_norm"
     )
diff --git a/benchmarks/python/test_broadcast_add_fwd.py b/benchmarks/python/test_broadcast_add_fwd.py
index abb320ef2a3..65db1555b28 100644
--- a/benchmarks/python/test_broadcast_add_fwd.py
+++ b/benchmarks/python/test_broadcast_add_fwd.py
@@ -88,7 +88,7 @@ def test_bcast_add_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, x])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("bcast_axis", [0, 1], ids=["outer", "inner"])
@@ -101,9 +101,9 @@ def test_bcast_add_baseline_benchmark(
     dtype: torch.dtype,
     bcast_axis: int,
     contiguous: bool,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     bias = torch.randn(size[1 - bcast_axis], dtype=dtype, device="cuda")
     input_shape = size if contiguous else (size[1], size[0])
@@ -112,9 +112,14 @@ def test_bcast_add_baseline_benchmark(
         x = x.t()
     assert x.is_contiguous() == contiguous
 
+    benchmark_fn = {
+        "eager": bcast_add_fwd_fn,
+        "torchcompile": torch.compile(bcast_add_fwd_fn),
+    }
+
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(bcast_add_fwd_fn) if compile else bcast_add_fwd_fn,
+        benchmark_fn[executor],
         [bias, x, bcast_axis],
     )
diff --git a/benchmarks/python/test_dropout_layernorm_bwd.py b/benchmarks/python/test_dropout_layernorm_bwd.py
index 6acaa012c5c..380a2085b09 100644
--- a/benchmarks/python/test_dropout_layernorm_bwd.py
+++ b/benchmarks/python/test_dropout_layernorm_bwd.py
@@ -189,16 +189,16 @@ def test_dropout_layernorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     dropout_p = 0.2
@@ -217,13 +217,16 @@ def dropout_layernorm_fwd():
     )
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": dropout_layernorm_fwd,
+        "torchcompile": torch.compile(dropout_layernorm_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=dropout_layernorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_dropout_layernorm_fwd.py b/benchmarks/python/test_dropout_layernorm_fwd.py
index 47854fcd2d7..4408a2bd611 100644
--- a/benchmarks/python/test_dropout_layernorm_fwd.py
+++ b/benchmarks/python/test_dropout_layernorm_fwd.py
@@ -160,16 +160,16 @@ def test_dropout_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     dropout_p = 0.2
@@ -181,10 +181,15 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
         dropout_p,
     ]
 
+    benchmark_fn = {
+        "eager": dropout_layernorm_fwd,
+        "torchcompile": torch.compile(dropout_layernorm_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd,
+        benchmark_fn[executor],
         inputs,
         iobytes=dropout_layernorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_dropout_rmsnorm_bwd.py b/benchmarks/python/test_dropout_rmsnorm_bwd.py
index 8c61c51e2d9..d196e76f57b 100644
--- a/benchmarks/python/test_dropout_rmsnorm_bwd.py
+++ b/benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -169,16 +169,16 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     dropout_p = 0.2
     input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
@@ -191,12 +191,15 @@ def dropout_rmsnorm_fwd():
         output = weights * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)
         return output
 
-    fwd_fn = torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": dropout_rmsnorm_fwd,
+        "torchcompile": torch.compile(dropout_rmsnorm_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=dropout_rmsnorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_dropout_rmsnorm_fwd.py b/benchmarks/python/test_dropout_rmsnorm_fwd.py
index a93a8caf547..aea2674df9d 100644
--- a/benchmarks/python/test_dropout_rmsnorm_fwd.py
+++ b/benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -145,16 +145,16 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2, weights])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     dropout_p = 0.2
@@ -165,10 +165,15 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
         dropout_p,
     ]
 
+    benchmark_fn = {
+        "eager": dropout_rmsnorm_fwd,
+        "torchcompile": torch.compile(dropout_rmsnorm_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd,
+        benchmark_fn[executor],
         inputs,
         iobytes=dropout_rmsnorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_gelu_bwd.py b/benchmarks/python/test_gelu_bwd.py
index 648f0317cf9..ffd0b25c6a2 100644
--- a/benchmarks/python/test_gelu_bwd.py
+++ b/benchmarks/python/test_gelu_bwd.py
@@ -88,16 +88,16 @@ def test_gelu_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     bias = torch.ones(size[-1], device="cuda", dtype=dtype)
@@ -106,12 +106,15 @@ def test_gelu_bwd_baseline_benchmark(
     def gelu_fwd():
         return torch.nn.functional.gelu(inputs + bias, approximate="tanh")
 
-    fwd_fn = torch.compile(gelu_fwd) if compile else gelu_fwd
-    eager_output = fwd_fn()
+    fwd_fn = {
+        "eager": gelu_fwd,
+        "torchcompile": torch.compile(gelu_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [eager_output, grads],
+        [outputs, grads],
         iobytes=gelu_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_gelu_bwd_reduction.py b/benchmarks/python/test_gelu_bwd_reduction.py
index 09dfd53d88a..e860826eb49 100644
--- a/benchmarks/python/test_gelu_bwd_reduction.py
+++ b/benchmarks/python/test_gelu_bwd_reduction.py
@@ -103,7 +103,7 @@ def test_gelu_bwd_reduction_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -112,19 +112,23 @@ def test_gelu_bwd_reduction_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     bias = torch.ones(size[-1], device="cuda", dtype=dtype)
     grads = torch.randn(size, device="cuda", dtype=dtype)
     eager_output = torch.nn.functional.gelu(inputs + bias, approximate="tanh")
+
+    benchmark_fn = {
+        "eager": gelu_bwd_reduction_torch,
+        "torchcompile": torch.compile(gelu_bwd_reduction_torch),
+    }
+
     run_benchmark(
         benchmark,
-        torch.compile(gelu_bwd_reduction_torch)
-        if compile
-        else gelu_bwd_reduction_torch,
+        benchmark_fn[executor],
         [eager_output, grads, inputs, reduction_axis],
         iobytes=gelu_bwd_reduction_iobytes(size, dtype, reduction_axis),
     )
diff --git a/benchmarks/python/test_gelu_fwd.py b/benchmarks/python/test_gelu_fwd.py
index fa5f891ef8a..2f208b2c090 100644
--- a/benchmarks/python/test_gelu_fwd.py
+++ b/benchmarks/python/test_gelu_fwd.py
@@ -67,22 +67,26 @@ def test_gelu_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = [
         torch.randn(size, device="cuda", dtype=dtype, requires_grad=True),  # in_tensor
         torch.ones(size[-1], device="cuda", dtype=dtype),  # bias
     ]
+
+    benchmark_fn = {
+        "eager": gelu_fwd_fn,
+        "torchcompile": torch.compile(gelu_fwd_fn),
+    }
+
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
-    run_benchmark(
-        benchmark, torch.compile(gelu_fwd_fn) if compile else gelu_fwd_fn, inputs
-    )
+    run_benchmark(benchmark, benchmark_fn[executor], inputs)
diff --git a/benchmarks/python/test_groupnorm_fwd.py b/benchmarks/python/test_groupnorm_fwd.py
index af4c023d7d7..8c729e115d7 100644
--- a/benchmarks/python/test_groupnorm_fwd.py
+++ b/benchmarks/python/test_groupnorm_fwd.py
@@ -128,35 +128,16 @@ def test_groupnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, weight, bias])
 
 
-@pytest.mark.parametrize("size", generate_input_sizes(dims=4))
-@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
-def test_groupnorm_fwd_thunder_benchmark(
-    benchmark,
-    size: tuple,
-    dtype: torch.dtype,
-):
-    N, C, H, W = size
-    x = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
-    weight = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True)
-    bias = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True)
-    num_groups = get_n_groups(C)
-    # thunder compiled model
-    groupnorm_fwd_jit = thunder.jit(
-        groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
-    )
-    run_benchmark(benchmark, groupnorm_fwd_jit, [x, weight, bias, num_groups])
-
-
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_groupnorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     N, C, H, W = size
     x = torch.randn(size, device="cuda", dtype=dtype)
@@ -164,8 +145,15 @@ def test_groupnorm_fwd_baseline_benchmark(
     bias = torch.randn(C, device="cuda", dtype=dtype)
     num_groups = get_n_groups(C)
 
+    benchmark_fn = {
+        "eager": groupnorm_fwd,
+        "torchcompile": torch.compile(groupnorm_fwd),
+        "thunder": thunder.jit(
+            groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
+        ),
+    }
     run_benchmark(
         benchmark,
-        torch.compile(groupnorm_fwd) if compile else groupnorm_fwd,
+        benchmark_fn[executor],
         [x, weight, bias, num_groups],
     )
diff --git a/benchmarks/python/test_huggingface_attn_bwd.py b/benchmarks/python/test_huggingface_attn_bwd.py
index dd8c9f80114..bcb2b4d9268 100644
--- a/benchmarks/python/test_huggingface_attn_bwd.py
+++ b/benchmarks/python/test_huggingface_attn_bwd.py
@@ -107,16 +107,16 @@ def test_huggingface_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -134,14 +134,17 @@ def huggingface_attn_fwd():
         return output
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(huggingface_attn_fwd) if compile else huggingface_attn_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": huggingface_attn_fwd,
+        "torchcompile": torch.compile(huggingface_attn_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     grads = torch.randn(batch_size * nh, seq_len, seq_len, device="cuda", dtype=dtype)
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=huggingface_attn_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_huggingface_attn_fwd.py b/benchmarks/python/test_huggingface_attn_fwd.py
index 27a013a8481..714a12e41d1 100644
--- a/benchmarks/python/test_huggingface_attn_fwd.py
+++ b/benchmarks/python/test_huggingface_attn_fwd.py
@@ -135,16 +135,16 @@ def test_huggingface_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [attention_mask, inputs])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -153,10 +153,15 @@ def test_huggingface_attn_fwd_baseline_benchmark(
         batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype
     )
 
+    benchmark_fn = {
+        "eager": huggingface_attn_fwd,
+        "torchcompile": torch.compile(huggingface_attn_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(huggingface_attn_fwd) if compile else huggingface_attn_fwd,
+        benchmark_fn[executor],
         [attention_mask, inputs, size, dropout_p],
         iobytes=huggingface_attn_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_instancenorm_bwd.py b/benchmarks/python/test_instancenorm_bwd.py
index 99d3e3baf2b..4022c5f395f 100644
--- a/benchmarks/python/test_instancenorm_bwd.py
+++ b/benchmarks/python/test_instancenorm_bwd.py
@@ -30,13 +30,13 @@ def test_instancenorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_instancenorm_bwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_bwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "instance_norm"
+        benchmark, size, dtype, channels_last, executor, "instance_norm"
     )
diff --git a/benchmarks/python/test_instancenorm_fwd.py b/benchmarks/python/test_instancenorm_fwd.py
index 3b8f6564f51..3335fcc7bbf 100644
--- a/benchmarks/python/test_instancenorm_fwd.py
+++ b/benchmarks/python/test_instancenorm_fwd.py
@@ -29,13 +29,13 @@ def test_instancenorm_fwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_instancenorm_fwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_fwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "instance_norm"
+        benchmark, size, dtype, channels_last, executor, "instance_norm"
     )
diff --git a/benchmarks/python/test_layernorm_bwd.py b/benchmarks/python/test_layernorm_bwd.py
index d76046575dc..926ab2ef0fb 100644
--- a/benchmarks/python/test_layernorm_bwd.py
+++ b/benchmarks/python/test_layernorm_bwd.py
@@ -146,16 +146,16 @@ def test_layernorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, mean, invstd, weights])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
@@ -171,13 +171,16 @@ def layernorm_fwd():
         bias=bias,
     )
 
-    fwd_fn = torch.compile(layernorm_fwd) if compile else layernorm_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": layernorm_fwd,
+        "torchcompile": torch.compile(layernorm_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=layernorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_layernorm_fwd.py b/benchmarks/python/test_layernorm_fwd.py
index c6a5f24c8dc..52aa5838f62 100644
--- a/benchmarks/python/test_layernorm_fwd.py
+++ b/benchmarks/python/test_layernorm_fwd.py
@@ -106,16 +106,16 @@ def test_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, hidden_size = size
     inputs = [
@@ -124,10 +124,15 @@ def test_layernorm_fwd_baseline_benchmark(
         torch.randn(hidden_size, device="cuda", dtype=dtype),
     ]
 
+    benchmark_fn = {
+        "eager": layernorm_fwd,
+        "torchcompile": torch.compile(layernorm_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(layernorm_fwd) if compile else layernorm_fwd,
+        benchmark_fn[executor],
         inputs,
         iobytes=layernorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 865caba2e31..2448ac07fd9 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -25,14 +25,14 @@ def load_matmul_problems():
 
 
 @pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
-@pytest.mark.parametrize("compile", [False], ids=["eager"])
+@pytest.mark.parametrize("executor", ["eager"])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
 @pytest.mark.parametrize(
     "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
 )
 def test_matmul_baseline_benchmark(
     benchmark,
-    compile: bool,
+    executor: str,
     config: tuple,
     dtype: torch.dtype,
     half_reduction: bool,
diff --git a/benchmarks/python/test_nanogpt_attn_bwd.py b/benchmarks/python/test_nanogpt_attn_bwd.py
index 2efb8e7d58d..88d8d56e26d 100644
--- a/benchmarks/python/test_nanogpt_attn_bwd.py
+++ b/benchmarks/python/test_nanogpt_attn_bwd.py
@@ -124,16 +124,16 @@ def test_nanogpt_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask, bias_mask])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -154,14 +154,18 @@ def nanogpt_attn_fwd():
         return output
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(nanogpt_attn_fwd) if compile else nanogpt_attn_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": nanogpt_attn_fwd,
+        "torchcompile": torch.compile(nanogpt_attn_fwd),
+    }
+    outputs = fwd_fn[executor]()
+
     grads = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype)
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=nanogpt_attn_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_nanogpt_attn_fwd.py b/benchmarks/python/test_nanogpt_attn_fwd.py
index 4dbd5821c59..5336d96cba5 100644
--- a/benchmarks/python/test_nanogpt_attn_fwd.py
+++ b/benchmarks/python/test_nanogpt_attn_fwd.py
@@ -137,16 +137,16 @@ def test_nanogpt_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, bias])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -154,10 +154,16 @@ def test_nanogpt_attn_fwd_baseline_benchmark(
     bias = torch.tril(torch.ones(seq_len, seq_len, device="cuda")).view(
         1, 1, seq_len, seq_len
     )
+
+    benchmark_fn = {
+        "eager": nanogpt_attn_fwd,
+        "torchcompile": torch.compile(nanogpt_attn_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(nanogpt_attn_fwd) if compile else nanogpt_attn_fwd,
+        benchmark_fn[executor],
         [inputs, bias, size, dropout_p],
         iobytes=nanogpt_attn_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_pointwise_mul.py b/benchmarks/python/test_pointwise_mul.py
index 0162950cc47..31ec20d6b10 100644
--- a/benchmarks/python/test_pointwise_mul.py
+++ b/benchmarks/python/test_pointwise_mul.py
@@ -50,21 +50,26 @@ def test_pointwise_mul_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_pointwise_mul_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype)
+
+    benchmark_fn = {
+        "eager": pointwise_mul_fwd_fn,
+        "torchcompile": torch.compile(pointwise_mul_fwd_fn),
+    }
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(pointwise_mul_fwd_fn) if compile else pointwise_mul_fwd_fn,
+        benchmark_fn[executor],
         [input],
     )
diff --git a/benchmarks/python/test_reduction.py b/benchmarks/python/test_reduction.py
index f734769a1e5..303f65609b7 100644
--- a/benchmarks/python/test_reduction.py
+++ b/benchmarks/python/test_reduction.py
@@ -53,7 +53,7 @@ def test_reduction_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -62,14 +62,19 @@ def test_reduction_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype)
+
+    benchmark_fn = {
+        "eager": reduction_fwd_fn,
+        "torchcompile": torch.compile(reduction_fwd_fn),
+    }
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(reduction_fwd_fn) if compile else reduction_fwd_fn,
+        benchmark_fn[executor],
         [input, reduction_axis],
     )
diff --git a/benchmarks/python/test_reduction_epilogue.py b/benchmarks/python/test_reduction_epilogue.py
index 231090e4135..aacf7326d29 100644
--- a/benchmarks/python/test_reduction_epilogue.py
+++ b/benchmarks/python/test_reduction_epilogue.py
@@ -67,7 +67,7 @@ def test_reduction_epilogue_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, epilogue])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0])
@@ -76,17 +76,21 @@ def test_reduction_epilogue_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     x = torch.randn(size, device="cuda", dtype=dtype)
     epilogue = torch.randn(size[reduction_axis - 1], device="cuda", dtype=dtype)
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
+
+    benchmark_fn = {
+        "eager": reduction_epilogue_fwd_fn,
+        "torchcompile": torch.compile(reduction_epilogue_fwd_fn),
+    }
+
     run_benchmark(
         benchmark,
-        torch.compile(reduction_epilogue_fwd_fn)
-        if compile
-        else reduction_epilogue_fwd_fn,
+        benchmark_fn[executor],
         [x, epilogue, reduction_axis],
     )
diff --git a/benchmarks/python/test_rmsnorm_bwd.py b/benchmarks/python/test_rmsnorm_bwd.py
index 697aa8848ab..2fb4698fdbf 100644
--- a/benchmarks/python/test_rmsnorm_bwd.py
+++ b/benchmarks/python/test_rmsnorm_bwd.py
@@ -112,16 +112,16 @@ def test_rmsnorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, rms_eps, grads, weights])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -134,13 +134,13 @@ def rmsnorm_fwd():
         return output
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(rmsnorm_fwd) if compile else rmsnorm_fwd
-    output = fwd_fn()
+    fwd_fn = {"eager": rmsnorm_fwd, "torchcompile": torch.compile(rmsnorm_fwd)}
+    outputs = fwd_fn[executor]()
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=rmsnorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_rmsnorm_fwd.py b/benchmarks/python/test_rmsnorm_fwd.py
index b7839b631de..0114ae6507c 100644
--- a/benchmarks/python/test_rmsnorm_fwd.py
+++ b/benchmarks/python/test_rmsnorm_fwd.py
@@ -86,24 +86,28 @@ def test_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, weights])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype)
     weights = torch.randn(size[1], device="cuda", dtype=dtype)
 
+    benchmark_fn = {
+        "eager": rmsnorm_fwd_fn,
+        "torchcompile": torch.compile(rmsnorm_fwd_fn),
+    }
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(rmsnorm_fwd_fn) if compile else rmsnorm_fwd_fn,
+        benchmark_fn[executor],
         [inputs, weights],
         iobytes=rmsnorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_scale_bias_relu_bwd.py b/benchmarks/python/test_scale_bias_relu_bwd.py
index a85c62a1592..c98d32382b5 100644
--- a/benchmarks/python/test_scale_bias_relu_bwd.py
+++ b/benchmarks/python/test_scale_bias_relu_bwd.py
@@ -79,16 +79,16 @@ def test_sbr_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [scale, bool_mask, grads])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(*size, device="cuda", dtype=dtype)
@@ -99,12 +99,12 @@ def sbr_fwd():
         return torch.nn.functional.relu(inputs * scale + bias)
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(sbr_fwd) if compile else sbr_fwd
-    eager_output = sbr_fwd()
+    fwd_fn = {"eager": sbr_fwd, "torchcompile": torch.compile(sbr_fwd)}
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [eager_output, grads],
+        [outputs, grads],
         iobytes=sbr_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_scale_bias_relu_fwd.py b/benchmarks/python/test_scale_bias_relu_fwd.py
index ede13dbb767..c09b11296c3 100644
--- a/benchmarks/python/test_scale_bias_relu_fwd.py
+++ b/benchmarks/python/test_scale_bias_relu_fwd.py
@@ -82,24 +82,26 @@ def test_sbr_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, scale, inputs])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     bias = torch.ones(size[-1], device="cuda", dtype=dtype)
     scale = torch.ones(size[-1], device="cuda", dtype=dtype)
 
+    benchmark_fn = {"eager": sbr_fwd_fn, "torchcompile": torch.compile(sbr_fwd_fn)}
+
     run_benchmark(
         benchmark,
-        torch.compile(sbr_fwd_fn) if compile else sbr_fwd_fn,
+        benchmark_fn[executor],
         [bias, scale, inputs],
         iobytes=sbr_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_silu_mul_bwd.py b/benchmarks/python/test_silu_mul_bwd.py
index 98995e860b1..25276dec474 100644
--- a/benchmarks/python/test_silu_mul_bwd.py
+++ b/benchmarks/python/test_silu_mul_bwd.py
@@ -79,16 +79,16 @@ def test_silu_mul_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, x, y])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     x = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     y = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
@@ -98,12 +98,12 @@ def silu_mul_fwd():
         return torch.nn.functional.silu(x) * y
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(silu_mul_fwd) if compile else silu_mul_fwd
-    eager_output = fwd_fn()
+    fwd_fn = {"eager": silu_mul_fwd, "torchcompile": torch.compile(silu_mul_fwd)}
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [eager_output, grads],
+        [outputs, grads],
         iobytes=silu_mul_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_silu_mul_fwd.py b/benchmarks/python/test_silu_mul_fwd.py
index 0f1e86d0d56..3de05067cb2 100644
--- a/benchmarks/python/test_silu_mul_fwd.py
+++ b/benchmarks/python/test_silu_mul_fwd.py
@@ -56,22 +56,27 @@ def test_silu_mul_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = [torch.randn(*size, device="cuda", dtype=dtype) for _ in range(2)]
 
+    benchmark_fn = {
+        "eager": silu_mul_fwd_fn,
+        "torchcompile": torch.compile(silu_mul_fwd_fn),
+    }
+
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(silu_mul_fwd_fn) if compile else silu_mul_fwd_fn,
+        benchmark_fn[executor],
         inputs,
     )
diff --git a/benchmarks/python/test_softmax_bwd.py b/benchmarks/python/test_softmax_bwd.py
index 86f22654380..049da18fe27 100644
--- a/benchmarks/python/test_softmax_bwd.py
+++ b/benchmarks/python/test_softmax_bwd.py
@@ -91,7 +91,7 @@ def test_softmax_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -100,9 +100,9 @@ def test_softmax_bwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -110,12 +110,12 @@ def softmax_fwd():
         return torch.nn.functional.softmax(input, dim=reduction_axis)
 
-    fwd_fn = torch.compile(softmax_fwd) if compile else softmax_fwd
-    output = fwd_fn()
+    fwd_fn = {"eager": softmax_fwd, "torchcompile": torch.compile(softmax_fwd)}
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=softmax_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_softmax_fwd.py b/benchmarks/python/test_softmax_fwd.py
index 2e672eb2e30..d138aa1ced1 100644
--- a/benchmarks/python/test_softmax_fwd.py
+++ b/benchmarks/python/test_softmax_fwd.py
@@ -81,7 +81,7 @@ def test_softmax_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -90,15 +90,19 @@ def test_softmax_fwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype)
 
+    benchmark_fn = {
+        "eager": softmax_fwd_fn,
+        "torchcompile": torch.compile(softmax_fwd_fn),
+    }
     run_benchmark(
         benchmark,
-        torch.compile(softmax_fwd_fn) if compile else softmax_fwd_fn,
+        benchmark_fn[executor],
         [input, reduction_axis],
         iobytes=softmax_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index cf290f278a5..a4e3198cc9a 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -74,7 +74,7 @@ def test_transpose_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=3))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)])
@@ -83,15 +83,21 @@ def test_transpose_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     axes: list,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input1 = torch.randn(size, device="cuda", dtype=dtype)
     input2 = torch.randn(size, device="cuda", dtype=dtype)
+
+    benchmark_fn = {
+        "eager": transpose_fwd_fn,
+        "torchcompile": torch.compile(transpose_fwd_fn),
+    }
+
    # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(transpose_fwd_fn) if compile else transpose_fwd_fn,
+        benchmark_fn[executor],
         [input1, input2, axes[0], axes[1]],
     )
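Note (reviewer addition, not part of the patch): the refactor replaces the boolean `compile` switch with an `executor` string that indexes a dict of callables, and conftest.py now skips any parametrized executor whose `--benchmark-{executor}` flag was not passed. A minimal, self-contained sketch of the dispatch idea, using a hypothetical placeholder workload `toy_fwd_fn` and helper `get_benchmark_fn` (illustrative only, not names from this PR):

    import torch

    def toy_fwd_fn(x: torch.Tensor) -> torch.Tensor:
        # stand-in for the per-benchmark forward functions used in the patch
        return torch.nn.functional.gelu(x) * 2.0

    def get_benchmark_fn(executor: str):
        # torch.compile returns a lazy wrapper; compilation only happens on the
        # first call, so building the unused dict entry is cheap for eager runs
        benchmark_fn = {
            "eager": toy_fwd_fn,
            "torchcompile": torch.compile(toy_fwd_fn),
        }
        return benchmark_fn[executor]

With this parametrization a baseline run stays opt-in: for example, `pytest benchmarks/python/test_layernorm_fwd.py --benchmark-torchcompile` enables the torchcompile-parametrized baseline cases, while the eager cases remain skipped unless `--benchmark-eager` is also given.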