lintrunner
Priya2698 committed Nov 5, 2024
1 parent 6c23c44 · commit 9335863
Showing 26 changed files with 54 additions and 78 deletions.
23 changes: 10 additions & 13 deletions benchmarks/python/conftest.py
@@ -96,36 +96,33 @@ def pytest_configure(config):

def pytest_collection_modifyitems(session, config, items):
"""
The baseline benchmarks use `executor` parameter with
values ["eager", "torchcompile", "thunder"] that are optionally
run using `--benchmark-{executor}` flag. They are skipped by
default.
"""

from nvfuser.pytorch_utils import retry_on_oom_or_skip_test

def get_test_executor(item) -> str | None:
-if (
-    hasattr(item, "callspec")
-    and "executor" in item.callspec.params
-):
+if hasattr(item, "callspec") and "executor" in item.callspec.params:
return item.callspec.params["executor"]
return None

executors_to_skip = []

for executor in ["eager", "torchcompile", "thunder"]:
if not config.getoption(f"--benchmark-{executor}"):
executors_to_skip.append(executor)

for item in items:
item.obj = retry_on_oom_or_skip_test(item.obj)

test_executor = get_test_executor(item)

if test_executor is not None and test_executor in executors_to_skip:
item.add_marker(
-pytest.mark.skip(reason=f"need --benchmark-{test_executor} option to run.")
+pytest.mark.skip(
+    reason=f"need --benchmark-{test_executor} option to run."
+)
)
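For context, the hook above reads the `executor` value from `item.callspec.params` and skips baseline tests unless the matching `--benchmark-{executor}` flag is passed. Below is a minimal, hypothetical test that this logic would act on; the test name, body, and use of the pytest-benchmark `benchmark` fixture are illustrative and not part of this commit.

import pytest
import torch


@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"])
def test_example_baseline_benchmark(benchmark, executor):
    # Collected with "executor" in item.callspec.params, so the hook above
    # skips this test unless pytest was run with --benchmark-<executor>.
    fwd_fns = {
        "eager": torch.nn.functional.relu,
        "torchcompile": torch.compile(torch.nn.functional.relu),
    }
    if executor not in fwd_fns:
        pytest.skip("thunder variant omitted from this sketch")
    benchmark(fwd_fns[executor], torch.randn(1024, device="cuda"))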


12 changes: 3 additions & 9 deletions benchmarks/python/normalization.py
@@ -452,11 +452,8 @@ def norm_fwd_baseline_benchmark(
inputs = inputs.to(memory_format=torch.channels_last)

norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn

-benchmark_fn = {
-    "eager": norm_fwd_fn,
-    "torchcompile": torch.compile(norm_fwd_fn)
-}
+benchmark_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}

# Manually compute IOBytes: See PR #1725
run_benchmark(
@@ -496,10 +493,7 @@ def norm_bwd_baseline_benchmark(
norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn

# Compile the fwd fn for torchcompile
-fwd_fn = {
-    "eager": norm_fwd_fn,
-    "torchcompile": torch.compile(norm_fwd_fn)
-}
+fwd_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
outputs = fwd_fn[executor]([inputs, weight, bias, running_mean, running_var])

# Manually compute IOBytes: See PR #1725
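The backward baselines in this diff share the structure visible above: build the forward callable per executor, call it once to materialize outputs, then benchmark the backward pass through those outputs. A self-contained sketch of that flow, using a plain PyTorch batch norm as a stand-in for the benchmark's batchnorm_fwd_fn (the function body, shapes, and dtypes are illustrative assumptions; only the executor-keyed dict and the list-of-tensors call come from the diff):

import torch


def batchnorm_fwd_fn(inputs: list[torch.Tensor]) -> torch.Tensor:
    # Illustrative stand-in; unpack order mirrors the call site above.
    x, weight, bias, running_mean, running_var = inputs
    return torch.nn.functional.batch_norm(
        x, running_mean, running_var, weight=weight, bias=bias, training=True
    )


executor = "torchcompile"  # parametrized in the real tests

# Compile the fwd fn for torchcompile; "eager" reuses the plain definition.
fwd_fn = {"eager": batchnorm_fwd_fn, "torchcompile": torch.compile(batchnorm_fwd_fn)}

C = 32
x = torch.randn(16, C, 64, 64, device="cuda", requires_grad=True)
weight = torch.ones(C, device="cuda", requires_grad=True)
bias = torch.zeros(C, device="cuda", requires_grad=True)
running_mean = torch.zeros(C, device="cuda")
running_var = torch.ones(C, device="cuda")

outputs = fwd_fn[executor]([x, weight, bias, running_mean, running_var])

# The *_bwd benchmarks then time the backward pass through these outputs.
outputs.backward(torch.randn_like(outputs))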
4 changes: 2 additions & 2 deletions benchmarks/python/test_broadcast_add_fwd.py
@@ -111,10 +111,10 @@ def test_bcast_add_baseline_benchmark(
if not contiguous:
x = x.t()
assert x.is_contiguous() == contiguous

benchmark_fn = {
"eager": bcast_add_fwd_fn,
"torchcompile": torch.compile(bcast_add_fwd_fn)
"torchcompile": torch.compile(bcast_add_fwd_fn),
}

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
2 changes: 1 addition & 1 deletion benchmarks/python/test_dropout_layernorm_bwd.py
@@ -220,7 +220,7 @@ def dropout_layernorm_fwd():
fwd_fn = {
"eager": dropout_layernorm_fwd,
"torchcompile": torch.compile(dropout_layernorm_fwd),
}
outputs = fwd_fn[executor]()

# Manually compute IOBytes: See PR #1725
4 changes: 2 additions & 2 deletions benchmarks/python/test_dropout_layernorm_fwd.py
@@ -180,11 +180,11 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
torch.zeros(size[1], device="cuda", dtype=dtype),
dropout_p,
]

benchmark_fn = {
"eager": dropout_layernorm_fwd,
"torchcompile": torch.compile(dropout_layernorm_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -169,7 +169,7 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
"eager": dropout_rmsnorm_fwd,
"torchcompile": torch.compile(dropout_rmsnorm_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
4 changes: 2 additions & 2 deletions benchmarks/python/test_gelu_bwd_reduction.py
@@ -120,12 +120,12 @@ def test_gelu_bwd_reduction_baseline_benchmark(
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
grads = torch.randn(size, device="cuda", dtype=dtype)
eager_output = torch.nn.functional.gelu(inputs + bias, approximate="tanh")

benchmark_fn = {
"eager": gelu_bwd_reduction_torch,
"torchcompile": torch.compile(gelu_bwd_reduction_torch),
}

run_benchmark(
benchmark,
benchmark_fn[executor],
8 changes: 3 additions & 5 deletions benchmarks/python/test_gelu_fwd.py
@@ -82,13 +82,11 @@ def test_gelu_fwd_baseline_benchmark(
torch.randn(size, device="cuda", dtype=dtype, requires_grad=True), # in_tensor
torch.ones(size[-1], device="cuda", dtype=dtype), # bias
]

benchmark_fn = {
"eager": gelu_fwd_fn,
"torchcompile": torch.compile(gelu_fwd_fn),
}

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
-run_benchmark(
-    benchmark, benchmark_fn[executor], inputs
-)
+run_benchmark(benchmark, benchmark_fn[executor], inputs)
4 changes: 3 additions & 1 deletion benchmarks/python/test_groupnorm_fwd.py
@@ -148,7 +148,9 @@ def test_groupnorm_fwd_baseline_benchmark(
benchmark_fn = {
"eager": groupnorm_fwd,
"torchcompile": torch.compile(groupnorm_fwd),
"thunder": thunder.jit(groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex])
"thunder": thunder.jit(
groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
),
}
run_benchmark(
benchmark,
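The groupnorm baseline above is the only one in this diff that also registers a Thunder executor. Below is a hedged sketch of how that entry could be constructed and invoked; the nvfuserex import path and the stand-in groupnorm_fwd are assumptions, and only the thunder.jit keyword arguments come from the diff itself.

import torch
import thunder
from thunder.executors.nvfuserex import nvfuserex  # assumed import path


def groupnorm_fwd(x: torch.Tensor) -> torch.Tensor:
    # Illustrative stand-in for the benchmark's groupnorm_fwd.
    return torch.nn.functional.group_norm(x, num_groups=8)


benchmark_fn = {
    "eager": groupnorm_fwd,
    "torchcompile": torch.compile(groupnorm_fwd),
    "thunder": thunder.jit(
        groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
    ),
}

x = torch.randn(16, 32, 64, 64, device="cuda")
out = benchmark_fn["thunder"](x)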
2 changes: 1 addition & 1 deletion benchmarks/python/test_huggingface_attn_bwd.py
@@ -136,7 +136,7 @@ def huggingface_attn_fwd():
# Compile the fwd fn for torchcompile
fwd_fn = {
"eager": huggingface_attn_fwd,
"torchcompile": torch.compile(huggingface_attn_fwd)
"torchcompile": torch.compile(huggingface_attn_fwd),
}
outputs = fwd_fn[executor]()
grads = torch.randn(batch_size * nh, seq_len, seq_len, device="cuda", dtype=dtype)
2 changes: 1 addition & 1 deletion benchmarks/python/test_huggingface_attn_fwd.py
@@ -157,7 +157,7 @@ def test_huggingface_attn_fwd_baseline_benchmark(
"eager": huggingface_attn_fwd,
"torchcompile": torch.compile(huggingface_attn_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
2 changes: 1 addition & 1 deletion benchmarks/python/test_layernorm_fwd.py
@@ -128,7 +128,7 @@ def test_layernorm_fwd_baseline_benchmark(
"eager": layernorm_fwd,
"torchcompile": torch.compile(layernorm_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
2 changes: 1 addition & 1 deletion benchmarks/python/test_nanogpt_attn_bwd.py
@@ -159,7 +159,7 @@ def nanogpt_attn_fwd():
"torchcompile": torch.compile(nanogpt_attn_fwd),
}
outputs = fwd_fn[executor]()

grads = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype)

# Manually compute IOBytes: See PR #1725
6 changes: 3 additions & 3 deletions benchmarks/python/test_nanogpt_attn_fwd.py
@@ -154,12 +154,12 @@ def test_nanogpt_attn_fwd_baseline_benchmark(
bias = torch.tril(torch.ones(seq_len, seq_len, device="cuda")).view(
1, 1, seq_len, seq_len
)

benchmark_fn = {
"eager": nanogpt_attn_fwd,
"torchcompile": torch.compile(nanogpt_attn_fwd)
"torchcompile": torch.compile(nanogpt_attn_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
4 changes: 2 additions & 2 deletions benchmarks/python/test_pointwise_mul.py
@@ -62,10 +62,10 @@ def test_pointwise_mul_baseline_benchmark(
if executor == "torchcompile":
clear_dynamo_cache()
input = torch.randn(size, device="cuda", dtype=dtype)

benchmark_fn = {
"eager": pointwise_mul_fwd_fn,
"torchcompile": torch.compile(pointwise_mul_fwd_fn)
"torchcompile": torch.compile(pointwise_mul_fwd_fn),
}
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_reduction.py
@@ -67,10 +67,10 @@ def test_reduction_baseline_benchmark(
if executor == "torchcompile":
clear_dynamo_cache()
input = torch.randn(size, device="cuda", dtype=dtype)

benchmark_fn = {
"eager": reduction_fwd_fn,
"torchcompile": torch.compile(reduction_fwd_fn)
"torchcompile": torch.compile(reduction_fwd_fn),
}
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
6 changes: 3 additions & 3 deletions benchmarks/python/test_reduction_epilogue.py
@@ -83,12 +83,12 @@ def test_reduction_epilogue_baseline_benchmark(
x = torch.randn(size, device="cuda", dtype=dtype)
epilogue = torch.randn(size[reduction_axis - 1], device="cuda", dtype=dtype)
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation

benchmark_fn = {
"eager": reduction_epilogue_fwd_fn,
"torchcompile": torch.compile(reduction_epilogue_fwd_fn)
"torchcompile": torch.compile(reduction_epilogue_fwd_fn),
}

run_benchmark(
benchmark,
benchmark_fn[executor],
5 changes: 1 addition & 4 deletions benchmarks/python/test_rmsnorm_bwd.py
@@ -134,10 +134,7 @@ def rmsnorm_fwd():
return output

# Compile the fwd fn for torchcompile
-fwd_fn = {
-    "eager": rmsnorm_fwd,
-    "torchcompile": torch.compile(rmsnorm_fwd)
-}
+fwd_fn = {"eager": rmsnorm_fwd, "torchcompile": torch.compile(rmsnorm_fwd)}
outputs = fwd_fn[executor]()

# Manually compute IOBytes: See PR #1725
2 changes: 1 addition & 1 deletion benchmarks/python/test_rmsnorm_fwd.py
@@ -102,7 +102,7 @@ def test_rmsnorm_fwd_baseline_benchmark(

benchmark_fn = {
"eager": rmsnorm_fwd_fn,
"torchcompile": torch.compile(rmsnorm_fwd_fn)
"torchcompile": torch.compile(rmsnorm_fwd_fn),
}
# Manually compute IOBytes: See PR #1725
run_benchmark(
5 changes: 1 addition & 4 deletions benchmarks/python/test_scale_bias_relu_bwd.py
@@ -99,10 +99,7 @@ def sbr_fwd():
return torch.nn.functional.relu(inputs * scale + bias)

# Compile the fwd fn for torchcompile
-fwd_fn = {
-    "eager": sbr_fwd,
-    "torchcompile": torch.compile(sbr_fwd)
-}
+fwd_fn = {"eager": sbr_fwd, "torchcompile": torch.compile(sbr_fwd)}
outputs = fwd_fn[executor]()

run_benchmark(
7 changes: 2 additions & 5 deletions benchmarks/python/test_scale_bias_relu_fwd.py
@@ -97,11 +97,8 @@ def test_sbr_fwd_baseline_benchmark(
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
scale = torch.ones(size[-1], device="cuda", dtype=dtype)

-benchmark_fn = {
-    "eager": sbr_fwd_fn,
-    "torchcompile": torch.compile(sbr_fwd_fn)
-}
+benchmark_fn = {"eager": sbr_fwd_fn, "torchcompile": torch.compile(sbr_fwd_fn)}

run_benchmark(
benchmark,
benchmark_fn[executor],
5 changes: 1 addition & 4 deletions benchmarks/python/test_silu_mul_bwd.py
@@ -98,10 +98,7 @@ def silu_mul_fwd():
return torch.nn.functional.silu(x) * y

# Compile the fwd fn for torchcompile
-fwd_fn = {
-    "eager": silu_mul_fwd,
-    "torchcompile": torch.compile(silu_mul_fwd)
-}
+fwd_fn = {"eager": silu_mul_fwd, "torchcompile": torch.compile(silu_mul_fwd)}
outputs = fwd_fn[executor]()

run_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_silu_mul_fwd.py
@@ -71,9 +71,9 @@ def test_silu_mul_fwd_baseline_benchmark(

benchmark_fn = {
"eager": silu_mul_fwd_fn,
"torchcompile": torch.compile(silu_mul_fwd_fn)
"torchcompile": torch.compile(silu_mul_fwd_fn),
}

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
benchmark,
5 changes: 1 addition & 4 deletions benchmarks/python/test_softmax_bwd.py
@@ -110,10 +110,7 @@ def test_softmax_bwd_baseline_benchmark(
def softmax_fwd():
return torch.nn.functional.softmax(input, dim=reduction_axis)

-fwd_fn = {
-    "eager": softmax_fwd,
-    "torchcompile": torch.compile(softmax_fwd)
-}
+fwd_fn = {"eager": softmax_fwd, "torchcompile": torch.compile(softmax_fwd)}
outputs = fwd_fn[executor]()

run_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_softmax_fwd.py
@@ -98,7 +98,7 @@ def test_softmax_fwd_baseline_benchmark(

benchmark_fn = {
"eager": softmax_fwd_fn,
"torchcompile": torch.compile(softmax_fwd_fn)
"torchcompile": torch.compile(softmax_fwd_fn),
}
run_benchmark(
benchmark,
6 changes: 3 additions & 3 deletions benchmarks/python/test_transpose.py
@@ -89,12 +89,12 @@ def test_transpose_baseline_benchmark(
clear_dynamo_cache()
input1 = torch.randn(size, device="cuda", dtype=dtype)
input2 = torch.randn(size, device="cuda", dtype=dtype)

benchmark_fn = {
"eager": transpose_fwd_fn,
"torchcompile": torch.compile(transpose_fwd_fn)
"torchcompile": torch.compile(transpose_fwd_fn),
}

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
benchmark,