diff --git a/benchmarks/python/conftest.py b/benchmarks/python/conftest.py
index 8932afbff30..03adbe1e7dd 100644
--- a/benchmarks/python/conftest.py
+++ b/benchmarks/python/conftest.py
@@ -96,45 +96,39 @@ def pytest_configure(config):
 def pytest_collection_modifyitems(session, config, items):
     """
-    The baseline benchmarks use `compile` parameter:
-        compile = false: Eager mode benchmark
-        compile = true: torch.compile benchmark
+    The baseline benchmarks use `executor` parameter with
+    values ["eager", "torchcompile", "thunder"] that are optionally
+    run using `--benchmark-{executor}` flag. They are skipped by
+    default.
     """
-    run_eager = config.getoption("--benchmark-eager")
-    run_thunder = config.getoption("--benchmark-thunder")
-    run_torchcompile = config.getoption("--benchmark-torchcompile")
     from nvfuser.pytorch_utils import retry_on_oom_or_skip_test
 
+    executors = ["eager", "torchcompile", "thunder"]
+
+    def get_test_executor(item) -> str | None:
+        if hasattr(item, "callspec") and "executor" in item.callspec.params:
+            test_executor = item.callspec.params["executor"]
+            assert (
+                test_executor in executors
+            ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}."
+            return test_executor
+        return None
+
+    executors_to_skip = []
+
+    for executor in executors:
+        if not config.getoption(f"--benchmark-{executor}"):
+            executors_to_skip.append(executor)
+
     for item in items:
         item.obj = retry_on_oom_or_skip_test(item.obj)
 
-    if not run_eager:
-        skip_eager = pytest.mark.skip(reason="need --benchmark-eager option to run")
-        for item in items:
-            # If the benchmark has compile=False parameter (eager mode), skip it.
-            if (
-                hasattr(item, "callspec")
-                and "compile" in item.callspec.params
-                and not item.callspec.params["compile"]
-            ):
-                item.add_marker(skip_eager)
-
-    if not run_torchcompile:
-        skip_torchcompile = pytest.mark.skip(
-            reason="need --benchmark-torchcompile option to run"
-        )
-        for item in items:
-            # If the benchmark has compile=True parameter (torch.compile mode), skip it.
-            if (
-                hasattr(item, "callspec")
-                and "compile" in item.callspec.params
-                and item.callspec.params["compile"]
-            ):
-                item.add_marker(skip_torchcompile)
-
-    if not run_thunder:
-        skip_thunder = pytest.mark.skip(reason="need --benchmark-thunder option to run")
-        for item in items:
-            if "thunder" in item.nodeid:
-                item.add_marker(skip_thunder)
+        test_executor = get_test_executor(item)
+
+        if test_executor is not None and test_executor in executors_to_skip:
+            item.add_marker(
+                pytest.mark.skip(
+                    reason=f"need --benchmark-{test_executor} option to run."
+                )
+            )
diff --git a/benchmarks/python/normalization.py b/benchmarks/python/normalization.py
index 8cbafe81353..6d493338846 100644
--- a/benchmarks/python/normalization.py
+++ b/benchmarks/python/normalization.py
@@ -433,10 +433,10 @@ def norm_fwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     channels_last: bool,
-    compile: bool,
+    executor: str,
     norm: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     assert norm in ["batch_norm", "instance_norm"], NotImplementedError
@@ -453,10 +453,12 @@ def norm_fwd_baseline_benchmark(
 
     norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn
 
+    benchmark_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(norm_fwd_fn) if compile else norm_fwd_fn,
+        benchmark_fn[executor],
         [inputs, weight, bias, running_mean, running_var],
         iobytes=norm_fwd_iobytes(size, dtype, norm),
     )
@@ -467,10 +469,10 @@ def norm_bwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     channels_last: bool,
-    compile: bool,
+    executor: str,
     norm: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     assert norm in ["batch_norm", "instance_norm"], NotImplementedError
@@ -491,13 +493,13 @@ def norm_bwd_baseline_benchmark(
     norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn
 
     # Compile the fwd fn for torchcompile
-    norm_fwd_fn = torch.compile(norm_fwd_fn) if compile else norm_fwd_fn
-    output = norm_fwd_fn([inputs, weight, bias, running_mean, running_var])
+    fwd_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
+    outputs = fwd_fn[executor]([inputs, weight, bias, running_mean, running_var])
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=norm_bwd_iobytes(size, dtype, norm),
     )
diff --git a/benchmarks/python/test_batchnorm_bwd.py b/benchmarks/python/test_batchnorm_bwd.py
index 74242ba99e2..0a1cd64cc57 100644
--- a/benchmarks/python/test_batchnorm_bwd.py
+++ b/benchmarks/python/test_batchnorm_bwd.py
@@ -31,13 +31,13 @@ def test_batchnorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_batchnorm_bwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_bwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "batch_norm"
+        benchmark, size, dtype, channels_last, executor, "batch_norm"
     )
diff --git a/benchmarks/python/test_batchnorm_fwd.py b/benchmarks/python/test_batchnorm_fwd.py
index 47b3997770a..af197ce6f1b 100644
--- a/benchmarks/python/test_batchnorm_fwd.py
+++ b/benchmarks/python/test_batchnorm_fwd.py
@@ -31,13 +31,13 @@ def test_batchnorm_fwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_batchnorm_fwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_fwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "batch_norm"
+        benchmark, size, dtype, channels_last, executor, "batch_norm"
     )
diff --git a/benchmarks/python/test_broadcast_add_fwd.py b/benchmarks/python/test_broadcast_add_fwd.py
index abb320ef2a3..65db1555b28 100644
--- a/benchmarks/python/test_broadcast_add_fwd.py
+++ b/benchmarks/python/test_broadcast_add_fwd.py
@@ -88,7 +88,7 @@ def test_bcast_add_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, x])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("bcast_axis", [0, 1], ids=["outer", "inner"])
@@ -101,9 +101,9 @@ def test_bcast_add_baseline_benchmark(
     dtype: torch.dtype,
     bcast_axis: int,
     contiguous: bool,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     bias = torch.randn(size[1 - bcast_axis], dtype=dtype, device="cuda")
     input_shape = size if contiguous else (size[1], size[0])
@@ -112,9 +112,14 @@ def test_bcast_add_baseline_benchmark(
         x = x.t()
     assert x.is_contiguous() == contiguous
 
+    benchmark_fn = {
+        "eager": bcast_add_fwd_fn,
+        "torchcompile": torch.compile(bcast_add_fwd_fn),
+    }
+
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(bcast_add_fwd_fn) if compile else bcast_add_fwd_fn,
+        benchmark_fn[executor],
         [bias, x, bcast_axis],
     )
diff --git a/benchmarks/python/test_dropout_layernorm_bwd.py b/benchmarks/python/test_dropout_layernorm_bwd.py
index 6acaa012c5c..380a2085b09 100644
--- a/benchmarks/python/test_dropout_layernorm_bwd.py
+++ b/benchmarks/python/test_dropout_layernorm_bwd.py
@@ -189,16 +189,16 @@ def test_dropout_layernorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     dropout_p = 0.2
@@ -217,13 +217,16 @@ def dropout_layernorm_fwd():
     )
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": dropout_layernorm_fwd,
+        "torchcompile": torch.compile(dropout_layernorm_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=dropout_layernorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_dropout_layernorm_fwd.py b/benchmarks/python/test_dropout_layernorm_fwd.py
index 47854fcd2d7..4408a2bd611 100644
--- a/benchmarks/python/test_dropout_layernorm_fwd.py
+++ b/benchmarks/python/test_dropout_layernorm_fwd.py
@@ -160,16 +160,16 @@ def test_dropout_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     dropout_p = 0.2
@@ -181,10 +181,15 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
         dropout_p,
     ]
 
+    benchmark_fn = {
+        "eager": dropout_layernorm_fwd,
+        "torchcompile": torch.compile(dropout_layernorm_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd,
+        benchmark_fn[executor],
         inputs,
         iobytes=dropout_layernorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_dropout_rmsnorm_bwd.py b/benchmarks/python/test_dropout_rmsnorm_bwd.py
index 8c61c51e2d9..d196e76f57b 100644
--- a/benchmarks/python/test_dropout_rmsnorm_bwd.py
+++ b/benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -169,16 +169,16 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     dropout_p = 0.2
     input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
@@ -191,12 +191,15 @@ def dropout_rmsnorm_fwd():
         output = weights * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)
         return output
 
-    fwd_fn = torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": dropout_rmsnorm_fwd,
+        "torchcompile": torch.compile(dropout_rmsnorm_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=dropout_rmsnorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_dropout_rmsnorm_fwd.py b/benchmarks/python/test_dropout_rmsnorm_fwd.py
index a93a8caf547..aea2674df9d 100644
--- a/benchmarks/python/test_dropout_rmsnorm_fwd.py
+++ b/benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -145,16 +145,16 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2, weights])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     dropout_p = 0.2
@@ -165,10 +165,15 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
         dropout_p,
     ]
 
+    benchmark_fn = {
+        "eager": dropout_rmsnorm_fwd,
+        "torchcompile": torch.compile(dropout_rmsnorm_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd,
+        benchmark_fn[executor],
         inputs,
         iobytes=dropout_rmsnorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_gelu_bwd.py b/benchmarks/python/test_gelu_bwd.py
index 648f0317cf9..ffd0b25c6a2 100644
--- a/benchmarks/python/test_gelu_bwd.py
+++ b/benchmarks/python/test_gelu_bwd.py
@@ -88,16 +88,16 @@ def test_gelu_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     bias = torch.ones(size[-1], device="cuda", dtype=dtype)
@@ -106,12 +106,15 @@ def test_gelu_bwd_baseline_benchmark(
     def gelu_fwd():
         return torch.nn.functional.gelu(inputs + bias, approximate="tanh")
 
-    fwd_fn = torch.compile(gelu_fwd) if compile else gelu_fwd
-    eager_output = fwd_fn()
+    fwd_fn = {
+        "eager": gelu_fwd,
+        "torchcompile": torch.compile(gelu_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [eager_output, grads],
+        [outputs, grads],
         iobytes=gelu_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_gelu_bwd_reduction.py b/benchmarks/python/test_gelu_bwd_reduction.py
index 09dfd53d88a..e860826eb49 100644
--- a/benchmarks/python/test_gelu_bwd_reduction.py
+++ b/benchmarks/python/test_gelu_bwd_reduction.py
@@ -103,7 +103,7 @@ def test_gelu_bwd_reduction_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -112,19 +112,23 @@ def test_gelu_bwd_reduction_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     bias = torch.ones(size[-1], device="cuda", dtype=dtype)
     grads = torch.randn(size, device="cuda", dtype=dtype)
     eager_output = torch.nn.functional.gelu(inputs + bias, approximate="tanh")
+
+    benchmark_fn = {
+        "eager": gelu_bwd_reduction_torch,
+        "torchcompile": torch.compile(gelu_bwd_reduction_torch),
+    }
+
     run_benchmark(
         benchmark,
-        torch.compile(gelu_bwd_reduction_torch)
-        if compile
-        else gelu_bwd_reduction_torch,
+        benchmark_fn[executor],
         [eager_output, grads, inputs, reduction_axis],
         iobytes=gelu_bwd_reduction_iobytes(size, dtype, reduction_axis),
     )
diff --git a/benchmarks/python/test_gelu_fwd.py b/benchmarks/python/test_gelu_fwd.py
index fa5f891ef8a..2f208b2c090 100644
--- a/benchmarks/python/test_gelu_fwd.py
+++ b/benchmarks/python/test_gelu_fwd.py
@@ -67,22 +67,26 @@ def test_gelu_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = [
         torch.randn(size, device="cuda", dtype=dtype, requires_grad=True),  # in_tensor
         torch.ones(size[-1], device="cuda", dtype=dtype),  # bias
     ]
+
+    benchmark_fn = {
+        "eager": gelu_fwd_fn,
+        "torchcompile": torch.compile(gelu_fwd_fn),
+    }
+
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
-    run_benchmark(
-        benchmark, torch.compile(gelu_fwd_fn) if compile else gelu_fwd_fn, inputs
-    )
+    run_benchmark(benchmark, benchmark_fn[executor], inputs)
diff --git a/benchmarks/python/test_groupnorm_fwd.py b/benchmarks/python/test_groupnorm_fwd.py
index af4c023d7d7..8c729e115d7 100644
--- a/benchmarks/python/test_groupnorm_fwd.py
+++ b/benchmarks/python/test_groupnorm_fwd.py
@@ -128,35 +128,16 @@ def test_groupnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, weight, bias])
 
 
-@pytest.mark.parametrize("size", generate_input_sizes(dims=4))
-@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
-def test_groupnorm_fwd_thunder_benchmark(
-    benchmark,
-    size: tuple,
-    dtype: torch.dtype,
-):
-    N, C, H, W = size
-    x = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
-    weight = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True)
-    bias = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True)
-    num_groups = get_n_groups(C)
-    # thunder compiled model
-    groupnorm_fwd_jit = thunder.jit(
-        groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
-    )
-    run_benchmark(benchmark, groupnorm_fwd_jit, [x, weight, bias, num_groups])
-
-
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_groupnorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     N, C, H, W = size
     x = torch.randn(size, device="cuda", dtype=dtype)
@@ -164,8 +145,15 @@ def test_groupnorm_fwd_baseline_benchmark(
     bias = torch.randn(C, device="cuda", dtype=dtype)
     num_groups = get_n_groups(C)
 
+    benchmark_fn = {
+        "eager": groupnorm_fwd,
+        "torchcompile": torch.compile(groupnorm_fwd),
+        "thunder": thunder.jit(
+            groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
+        ),
+    }
     run_benchmark(
         benchmark,
-        torch.compile(groupnorm_fwd) if compile else groupnorm_fwd,
+        benchmark_fn[executor],
         [x, weight, bias, num_groups],
     )
diff --git a/benchmarks/python/test_huggingface_attn_bwd.py b/benchmarks/python/test_huggingface_attn_bwd.py
index dd8c9f80114..bcb2b4d9268 100644
--- a/benchmarks/python/test_huggingface_attn_bwd.py
+++ b/benchmarks/python/test_huggingface_attn_bwd.py
@@ -107,16 +107,16 @@ def test_huggingface_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -134,14 +134,17 @@ def huggingface_attn_fwd():
         return output
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(huggingface_attn_fwd) if compile else huggingface_attn_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": huggingface_attn_fwd,
+        "torchcompile": torch.compile(huggingface_attn_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     grads = torch.randn(batch_size * nh, seq_len, seq_len, device="cuda", dtype=dtype)
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=huggingface_attn_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_huggingface_attn_fwd.py b/benchmarks/python/test_huggingface_attn_fwd.py
index 27a013a8481..714a12e41d1 100644
--- a/benchmarks/python/test_huggingface_attn_fwd.py
+++ b/benchmarks/python/test_huggingface_attn_fwd.py
@@ -135,16 +135,16 @@ def test_huggingface_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [attention_mask, inputs])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -153,10 +153,15 @@ def test_huggingface_attn_fwd_baseline_benchmark(
         batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype
     )
 
+    benchmark_fn = {
+        "eager": huggingface_attn_fwd,
+        "torchcompile": torch.compile(huggingface_attn_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(huggingface_attn_fwd) if compile else huggingface_attn_fwd,
+        benchmark_fn[executor],
         [attention_mask, inputs, size, dropout_p],
         iobytes=huggingface_attn_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_instancenorm_bwd.py b/benchmarks/python/test_instancenorm_bwd.py
index 99d3e3baf2b..4022c5f395f 100644
--- a/benchmarks/python/test_instancenorm_bwd.py
+++ b/benchmarks/python/test_instancenorm_bwd.py
@@ -30,13 +30,13 @@ def test_instancenorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_instancenorm_bwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_bwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "instance_norm"
+        benchmark, size, dtype, channels_last, executor, "instance_norm"
     )
diff --git a/benchmarks/python/test_instancenorm_fwd.py b/benchmarks/python/test_instancenorm_fwd.py
index 3b8f6564f51..3335fcc7bbf 100644
--- a/benchmarks/python/test_instancenorm_fwd.py
+++ b/benchmarks/python/test_instancenorm_fwd.py
@@ -29,13 +29,13 @@ def test_instancenorm_fwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_instancenorm_fwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_fwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "instance_norm"
+        benchmark, size, dtype, channels_last, executor, "instance_norm"
     )
diff --git a/benchmarks/python/test_layernorm_bwd.py b/benchmarks/python/test_layernorm_bwd.py
index d76046575dc..926ab2ef0fb 100644
--- a/benchmarks/python/test_layernorm_bwd.py
+++ b/benchmarks/python/test_layernorm_bwd.py
@@ -146,16 +146,16 @@ def test_layernorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, mean, invstd, weights])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
 
     inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
@@ -171,13 +171,16 @@ def layernorm_fwd():
         bias=bias,
     )
 
-    fwd_fn = torch.compile(layernorm_fwd) if compile else layernorm_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": layernorm_fwd,
+        "torchcompile": torch.compile(layernorm_fwd),
+    }
+    outputs = fwd_fn[executor]()
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=layernorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_layernorm_fwd.py b/benchmarks/python/test_layernorm_fwd.py
index c6a5f24c8dc..52aa5838f62 100644
--- a/benchmarks/python/test_layernorm_fwd.py
+++ b/benchmarks/python/test_layernorm_fwd.py
@@ -106,16 +106,16 @@ def test_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, hidden_size = size
     inputs = [
@@ -124,10 +124,15 @@ def test_layernorm_fwd_baseline_benchmark(
         torch.randn(hidden_size, device="cuda", dtype=dtype),
     ]
 
+    benchmark_fn = {
+        "eager": layernorm_fwd,
+        "torchcompile": torch.compile(layernorm_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(layernorm_fwd) if compile else layernorm_fwd,
+        benchmark_fn[executor],
         inputs,
         iobytes=layernorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 865caba2e31..2448ac07fd9 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -25,14 +25,14 @@ def load_matmul_problems():
 
 
 @pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
-@pytest.mark.parametrize("compile", [False], ids=["eager"])
+@pytest.mark.parametrize("executor", ["eager"])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
 @pytest.mark.parametrize(
     "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
 )
 def test_matmul_baseline_benchmark(
     benchmark,
-    compile: bool,
+    executor: str,
     config: tuple,
     dtype: torch.dtype,
     half_reduction: bool,
diff --git a/benchmarks/python/test_nanogpt_attn_bwd.py b/benchmarks/python/test_nanogpt_attn_bwd.py
index 2efb8e7d58d..88d8d56e26d 100644
--- a/benchmarks/python/test_nanogpt_attn_bwd.py
+++ b/benchmarks/python/test_nanogpt_attn_bwd.py
@@ -124,16 +124,16 @@ def test_nanogpt_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask, bias_mask])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -154,14 +154,18 @@ def nanogpt_attn_fwd():
         return output
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(nanogpt_attn_fwd) if compile else nanogpt_attn_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": nanogpt_attn_fwd,
+        "torchcompile": torch.compile(nanogpt_attn_fwd),
+    }
+    outputs = fwd_fn[executor]()
+
     grads = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype)
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=nanogpt_attn_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_nanogpt_attn_fwd.py b/benchmarks/python/test_nanogpt_attn_fwd.py
index 4dbd5821c59..5336d96cba5 100644
--- a/benchmarks/python/test_nanogpt_attn_fwd.py
+++ b/benchmarks/python/test_nanogpt_attn_fwd.py
@@ -137,16 +137,16 @@ def test_nanogpt_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, bias])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -154,10 +154,16 @@ def test_nanogpt_attn_fwd_baseline_benchmark(
     bias = torch.tril(torch.ones(seq_len, seq_len, device="cuda")).view(
         1, 1, seq_len, seq_len
     )
+
+    benchmark_fn = {
+        "eager": nanogpt_attn_fwd,
+        "torchcompile": torch.compile(nanogpt_attn_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(nanogpt_attn_fwd) if compile else nanogpt_attn_fwd,
+        benchmark_fn[executor],
         [inputs, bias, size, dropout_p],
         iobytes=nanogpt_attn_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_pointwise_mul.py b/benchmarks/python/test_pointwise_mul.py
index 0162950cc47..31ec20d6b10 100644
--- a/benchmarks/python/test_pointwise_mul.py
+++ b/benchmarks/python/test_pointwise_mul.py
@@ -50,21 +50,26 @@ def test_pointwise_mul_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_pointwise_mul_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype)
+
+    benchmark_fn = {
+        "eager": pointwise_mul_fwd_fn,
+        "torchcompile": torch.compile(pointwise_mul_fwd_fn),
+    }
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(pointwise_mul_fwd_fn) if compile else pointwise_mul_fwd_fn,
+        benchmark_fn[executor],
         [input],
     )
diff --git a/benchmarks/python/test_reduction.py b/benchmarks/python/test_reduction.py
index f734769a1e5..303f65609b7 100644
--- a/benchmarks/python/test_reduction.py
+++ b/benchmarks/python/test_reduction.py
@@ -53,7 +53,7 @@ def test_reduction_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -62,14 +62,19 @@ def test_reduction_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype)
+
+    benchmark_fn = {
+        "eager": reduction_fwd_fn,
+        "torchcompile": torch.compile(reduction_fwd_fn),
+    }
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(reduction_fwd_fn) if compile else reduction_fwd_fn,
+        benchmark_fn[executor],
         [input, reduction_axis],
     )
diff --git a/benchmarks/python/test_reduction_epilogue.py b/benchmarks/python/test_reduction_epilogue.py
index 231090e4135..aacf7326d29 100644
--- a/benchmarks/python/test_reduction_epilogue.py
+++ b/benchmarks/python/test_reduction_epilogue.py
@@ -67,7 +67,7 @@ def test_reduction_epilogue_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, epilogue])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0])
@@ -76,17 +76,21 @@ def test_reduction_epilogue_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     x = torch.randn(size, device="cuda", dtype=dtype)
     epilogue = torch.randn(size[reduction_axis - 1], device="cuda", dtype=dtype)
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
+
+    benchmark_fn = {
+        "eager": reduction_epilogue_fwd_fn,
+        "torchcompile": torch.compile(reduction_epilogue_fwd_fn),
+    }
+
     run_benchmark(
         benchmark,
-        torch.compile(reduction_epilogue_fwd_fn)
-        if compile
-        else reduction_epilogue_fwd_fn,
+        benchmark_fn[executor],
         [x, epilogue, reduction_axis],
     )
diff --git a/benchmarks/python/test_rmsnorm_bwd.py b/benchmarks/python/test_rmsnorm_bwd.py
index 697aa8848ab..2fb4698fdbf 100644
--- a/benchmarks/python/test_rmsnorm_bwd.py
+++ b/benchmarks/python/test_rmsnorm_bwd.py
@@ -112,16 +112,16 @@ def test_rmsnorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, rms_eps, grads, weights])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -134,13 +134,13 @@ def rmsnorm_fwd():
         return output
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(rmsnorm_fwd) if compile else rmsnorm_fwd
-    output = fwd_fn()
+    fwd_fn = {"eager": rmsnorm_fwd, "torchcompile": torch.compile(rmsnorm_fwd)}
+    outputs = fwd_fn[executor]()
 
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=rmsnorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_rmsnorm_fwd.py b/benchmarks/python/test_rmsnorm_fwd.py
index b7839b631de..0114ae6507c 100644
--- a/benchmarks/python/test_rmsnorm_fwd.py
+++ b/benchmarks/python/test_rmsnorm_fwd.py
@@ -86,24 +86,28 @@ def test_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, weights])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype)
     weights = torch.randn(size[1], device="cuda", dtype=dtype)
 
+    benchmark_fn = {
+        "eager": rmsnorm_fwd_fn,
+        "torchcompile": torch.compile(rmsnorm_fwd_fn),
+    }
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(rmsnorm_fwd_fn) if compile else rmsnorm_fwd_fn,
+        benchmark_fn[executor],
         [inputs, weights],
         iobytes=rmsnorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_scale_bias_relu_bwd.py b/benchmarks/python/test_scale_bias_relu_bwd.py
index a85c62a1592..c98d32382b5 100644
--- a/benchmarks/python/test_scale_bias_relu_bwd.py
+++ b/benchmarks/python/test_scale_bias_relu_bwd.py
@@ -79,16 +79,16 @@ def test_sbr_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [scale, bool_mask, grads])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(*size, device="cuda", dtype=dtype)
@@ -99,12 +99,12 @@ def sbr_fwd():
         return torch.nn.functional.relu(inputs * scale + bias)
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(sbr_fwd) if compile else sbr_fwd
-    eager_output = sbr_fwd()
+    fwd_fn = {"eager": sbr_fwd, "torchcompile": torch.compile(sbr_fwd)}
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [eager_output, grads],
+        [outputs, grads],
         iobytes=sbr_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_scale_bias_relu_fwd.py b/benchmarks/python/test_scale_bias_relu_fwd.py
index ede13dbb767..c09b11296c3 100644
--- a/benchmarks/python/test_scale_bias_relu_fwd.py
+++ b/benchmarks/python/test_scale_bias_relu_fwd.py
@@ -82,24 +82,26 @@ def test_sbr_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, scale, inputs])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     bias = torch.ones(size[-1], device="cuda", dtype=dtype)
     scale = torch.ones(size[-1], device="cuda", dtype=dtype)
 
+    benchmark_fn = {"eager": sbr_fwd_fn, "torchcompile": torch.compile(sbr_fwd_fn)}
+
     run_benchmark(
         benchmark,
-        torch.compile(sbr_fwd_fn) if compile else sbr_fwd_fn,
+        benchmark_fn[executor],
         [bias, scale, inputs],
         iobytes=sbr_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_silu_mul_bwd.py b/benchmarks/python/test_silu_mul_bwd.py
index 98995e860b1..25276dec474 100644
--- a/benchmarks/python/test_silu_mul_bwd.py
+++ b/benchmarks/python/test_silu_mul_bwd.py
@@ -79,16 +79,16 @@ def test_silu_mul_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, x, y])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     x = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     y = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
@@ -98,12 +98,12 @@ def silu_mul_fwd():
         return torch.nn.functional.silu(x) * y
 
     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(silu_mul_fwd) if compile else silu_mul_fwd
-    eager_output = fwd_fn()
+    fwd_fn = {"eager": silu_mul_fwd, "torchcompile": torch.compile(silu_mul_fwd)}
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [eager_output, grads],
+        [outputs, grads],
         iobytes=silu_mul_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_silu_mul_fwd.py b/benchmarks/python/test_silu_mul_fwd.py
index 0f1e86d0d56..3de05067cb2 100644
--- a/benchmarks/python/test_silu_mul_fwd.py
+++ b/benchmarks/python/test_silu_mul_fwd.py
@@ -56,22 +56,27 @@ def test_silu_mul_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = [torch.randn(*size, device="cuda", dtype=dtype) for _ in range(2)]
 
+    benchmark_fn = {
+        "eager": silu_mul_fwd_fn,
+        "torchcompile": torch.compile(silu_mul_fwd_fn),
+    }
+
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(silu_mul_fwd_fn) if compile else silu_mul_fwd_fn,
+        benchmark_fn[executor],
         inputs,
     )
diff --git a/benchmarks/python/test_softmax_bwd.py b/benchmarks/python/test_softmax_bwd.py
index 86f22654380..049da18fe27 100644
--- a/benchmarks/python/test_softmax_bwd.py
+++ b/benchmarks/python/test_softmax_bwd.py
@@ -91,7 +91,7 @@ def test_softmax_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -100,9 +100,9 @@ def test_softmax_bwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -110,12 +110,12 @@ def softmax_fwd():
         return torch.nn.functional.softmax(input, dim=reduction_axis)
 
-    fwd_fn = torch.compile(softmax_fwd) if compile else softmax_fwd
-    output = fwd_fn()
+    fwd_fn = {"eager": softmax_fwd, "torchcompile": torch.compile(softmax_fwd)}
+    outputs = fwd_fn[executor]()
 
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=softmax_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_softmax_fwd.py b/benchmarks/python/test_softmax_fwd.py
index 2e672eb2e30..d138aa1ced1 100644
--- a/benchmarks/python/test_softmax_fwd.py
+++ b/benchmarks/python/test_softmax_fwd.py
@@ -81,7 +81,7 @@ def test_softmax_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -90,15 +90,19 @@ def test_softmax_fwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype)
 
+    benchmark_fn = {
+        "eager": softmax_fwd_fn,
+        "torchcompile": torch.compile(softmax_fwd_fn),
+    }
     run_benchmark(
         benchmark,
-        torch.compile(softmax_fwd_fn) if compile else softmax_fwd_fn,
+        benchmark_fn[executor],
         [input, reduction_axis],
         iobytes=softmax_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index cf290f278a5..a4e3198cc9a 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -74,7 +74,7 @@ def test_transpose_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2])
 
 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=3))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)])
@@ -83,15 +83,21 @@ def test_transpose_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     axes: list,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input1 = torch.randn(size, device="cuda", dtype=dtype)
     input2 = torch.randn(size, device="cuda", dtype=dtype)
+
+    benchmark_fn = {
+        "eager": transpose_fwd_fn,
+        "torchcompile": torch.compile(transpose_fwd_fn),
+    }
+
    # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(transpose_fwd_fn) if compile else transpose_fwd_fn,
+        benchmark_fn[executor],
         [input1, input2, axes[0], axes[1]],
     )
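Note (reviewer addition, not part of the patch): the refactor replaces the boolean `compile` switch with an `executor` string that indexes a dict of callables, and conftest.py now skips any parametrized executor whose `--benchmark-{executor}` flag was not passed. A minimal, self-contained sketch of the dispatch idea, using a hypothetical placeholder workload `toy_fwd_fn` and helper `get_benchmark_fn` (illustrative only, not names from this PR):

    import torch

    def toy_fwd_fn(x: torch.Tensor) -> torch.Tensor:
        # stand-in for the per-benchmark forward functions used in the patch
        return torch.nn.functional.gelu(x) * 2.0

    def get_benchmark_fn(executor: str):
        # torch.compile returns a lazy wrapper; compilation only happens on the
        # first call, so building the unused dict entry is cheap for eager runs
        benchmark_fn = {
            "eager": toy_fwd_fn,
            "torchcompile": torch.compile(toy_fwd_fn),
        }
        return benchmark_fn[executor]

With this parametrization a baseline run stays opt-in: for example, `pytest benchmarks/python/test_layernorm_fwd.py --benchmark-torchcompile` enables the torchcompile-parametrized baseline cases, while the eager cases remain skipped unless `--benchmark-eager` is also given.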