lintrunner
Priya2698 committed Nov 5, 2024
1 parent 6c23c44 · commit 9335863
Showing 26 changed files with 54 additions and 78 deletions.
23 changes: 10 additions & 13 deletions benchmarks/python/conftest.py
@@ -96,36 +96,33 @@ def pytest_configure(config):

def pytest_collection_modifyitems(session, config, items):
"""
The baseline benchmarks use `executor` parameter with
values ["eager", "torchcompile", "thunder"] that are optionally
run using `--benchmark-{executor}` flag. They are skipped by
default.
"""

from nvfuser.pytorch_utils import retry_on_oom_or_skip_test

def get_test_executor(item) -> str | None:
-if (
-    hasattr(item, "callspec")
-    and "executor" in item.callspec.params
-):
+if hasattr(item, "callspec") and "executor" in item.callspec.params:
return item.callspec.params["executor"]
return None

executors_to_skip = []

for executor in ["eager", "torchcompile", "thunder"]:
if not config.getoption(f"--benchmark-{executor}"):
executors_to_skip.append(executor)

for item in items:
item.obj = retry_on_oom_or_skip_test(item.obj)

test_executor = get_test_executor(item)

if test_executor is not None and test_executor in executors_to_skip:
item.add_marker(
-pytest.mark.skip(reason=f"need --benchmark-{test_executor} option to run.")
+pytest.mark.skip(
+    reason=f"need --benchmark-{test_executor} option to run."
+)
)
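For context, the hook above reads the `executor` value from `item.callspec.params` and skips baseline tests unless the matching `--benchmark-{executor}` flag is passed. Below is a minimal, hypothetical test that this logic would act on; the test name, body, and use of the pytest-benchmark `benchmark` fixture are illustrative and not part of this commit.

import pytest
import torch


@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"])
def test_example_baseline_benchmark(benchmark, executor):
    # Collected with "executor" in item.callspec.params, so the hook above
    # skips this test unless pytest was run with --benchmark-<executor>.
    fwd_fns = {
        "eager": torch.nn.functional.relu,
        "torchcompile": torch.compile(torch.nn.functional.relu),
    }
    if executor not in fwd_fns:
        pytest.skip("thunder variant omitted from this sketch")
    benchmark(fwd_fns[executor], torch.randn(1024, device="cuda"))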


12 changes: 3 additions & 9 deletions benchmarks/python/normalization.py
@@ -452,11 +452,8 @@ def norm_fwd_baseline_benchmark(
inputs = inputs.to(memory_format=torch.channels_last)

norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn

-benchmark_fn = {
-    "eager": norm_fwd_fn,
-    "torchcompile": torch.compile(norm_fwd_fn)
-}
+benchmark_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}

# Manually compute IOBytes: See PR #1725
run_benchmark(
@@ -496,10 +493,7 @@ def norm_bwd_baseline_benchmark(
norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn

# Compile the fwd fn for torchcompile
-fwd_fn = {
-    "eager": norm_fwd_fn,
-    "torchcompile": torch.compile(norm_fwd_fn)
-}
+fwd_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
outputs = fwd_fn[executor]([inputs, weight, bias, running_mean, running_var])

# Manually compute IOBytes: See PR #1725
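The backward baselines in this diff share the structure visible above: build the forward callable per executor, call it once to materialize outputs, then benchmark the backward pass through those outputs. A self-contained sketch of that flow, using a plain PyTorch batch norm as a stand-in for the benchmark's batchnorm_fwd_fn (the function body, shapes, and dtypes are illustrative assumptions; only the executor-keyed dict and the list-of-tensors call come from the diff):

import torch


def batchnorm_fwd_fn(inputs: list[torch.Tensor]) -> torch.Tensor:
    # Illustrative stand-in; unpack order mirrors the call site above.
    x, weight, bias, running_mean, running_var = inputs
    return torch.nn.functional.batch_norm(
        x, running_mean, running_var, weight=weight, bias=bias, training=True
    )


executor = "torchcompile"  # parametrized in the real tests

# Compile the fwd fn for torchcompile; "eager" reuses the plain definition.
fwd_fn = {"eager": batchnorm_fwd_fn, "torchcompile": torch.compile(batchnorm_fwd_fn)}

C = 32
x = torch.randn(16, C, 64, 64, device="cuda", requires_grad=True)
weight = torch.ones(C, device="cuda", requires_grad=True)
bias = torch.zeros(C, device="cuda", requires_grad=True)
running_mean = torch.zeros(C, device="cuda")
running_var = torch.ones(C, device="cuda")

outputs = fwd_fn[executor]([x, weight, bias, running_mean, running_var])

# The *_bwd benchmarks then time the backward pass through these outputs.
outputs.backward(torch.randn_like(outputs))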
4 changes: 2 additions & 2 deletions benchmarks/python/test_broadcast_add_fwd.py
@@ -111,10 +111,10 @@ def test_bcast_add_baseline_benchmark(
if not contiguous:
x = x.t()
assert x.is_contiguous() == contiguous

benchmark_fn = {
"eager": bcast_add_fwd_fn,
"torchcompile": torch.compile(bcast_add_fwd_fn)
"torchcompile": torch.compile(bcast_add_fwd_fn),
}

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
2 changes: 1 addition & 1 deletion benchmarks/python/test_dropout_layernorm_bwd.py
@@ -220,7 +220,7 @@ def dropout_layernorm_fwd():
fwd_fn = {
"eager": dropout_layernorm_fwd,
"torchcompile": torch.compile(dropout_layernorm_fwd),
}
outputs = fwd_fn[executor]()

# Manually compute IOBytes: See PR #1725
4 changes: 2 additions & 2 deletions benchmarks/python/test_dropout_layernorm_fwd.py
@@ -180,11 +180,11 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
torch.zeros(size[1], device="cuda", dtype=dtype),
dropout_p,
]

benchmark_fn = {
"eager": dropout_layernorm_fwd,
"torchcompile": torch.compile(dropout_layernorm_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -169,7 +169,7 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
"eager": dropout_rmsnorm_fwd,
"torchcompile": torch.compile(dropout_rmsnorm_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
4 changes: 2 additions & 2 deletions benchmarks/python/test_gelu_bwd_reduction.py
@@ -120,12 +120,12 @@ def test_gelu_bwd_reduction_baseline_benchmark(
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
grads = torch.randn(size, device="cuda", dtype=dtype)
eager_output = torch.nn.functional.gelu(inputs + bias, approximate="tanh")

benchmark_fn = {
"eager": gelu_bwd_reduction_torch,
"torchcompile": torch.compile(gelu_bwd_reduction_torch),
}

run_benchmark(
benchmark,
benchmark_fn[executor],
8 changes: 3 additions & 5 deletions benchmarks/python/test_gelu_fwd.py
@@ -82,13 +82,11 @@ def test_gelu_fwd_baseline_benchmark(
torch.randn(size, device="cuda", dtype=dtype, requires_grad=True), # in_tensor
torch.ones(size[-1], device="cuda", dtype=dtype), # bias
]

benchmark_fn = {
"eager": gelu_fwd_fn,
"torchcompile": torch.compile(gelu_fwd_fn),
}

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
-run_benchmark(
-    benchmark, benchmark_fn[executor], inputs
-)
+run_benchmark(benchmark, benchmark_fn[executor], inputs)
4 changes: 3 additions & 1 deletion benchmarks/python/test_groupnorm_fwd.py
@@ -148,7 +148,9 @@ def test_groupnorm_fwd_baseline_benchmark(
benchmark_fn = {
"eager": groupnorm_fwd,
"torchcompile": torch.compile(groupnorm_fwd),
"thunder": thunder.jit(groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex])
"thunder": thunder.jit(
groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
),
}
run_benchmark(
benchmark,
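The groupnorm baseline above is the only one in this diff that also registers a Thunder executor. Below is a hedged sketch of how that entry could be constructed and invoked; the nvfuserex import path and the stand-in groupnorm_fwd are assumptions, and only the thunder.jit keyword arguments come from the diff itself.

import torch
import thunder
from thunder.executors.nvfuserex import nvfuserex  # assumed import path


def groupnorm_fwd(x: torch.Tensor) -> torch.Tensor:
    # Illustrative stand-in for the benchmark's groupnorm_fwd.
    return torch.nn.functional.group_norm(x, num_groups=8)


benchmark_fn = {
    "eager": groupnorm_fwd,
    "torchcompile": torch.compile(groupnorm_fwd),
    "thunder": thunder.jit(
        groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
    ),
}

x = torch.randn(16, 32, 64, 64, device="cuda")
out = benchmark_fn["thunder"](x)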
2 changes: 1 addition & 1 deletion benchmarks/python/test_huggingface_attn_bwd.py
@@ -136,7 +136,7 @@ def huggingface_attn_fwd():
# Compile the fwd fn for torchcompile
fwd_fn = {
"eager": huggingface_attn_fwd,
"torchcompile": torch.compile(huggingface_attn_fwd)
"torchcompile": torch.compile(huggingface_attn_fwd),
}
outputs = fwd_fn[executor]()
grads = torch.randn(batch_size * nh, seq_len, seq_len, device="cuda", dtype=dtype)
2 changes: 1 addition & 1 deletion benchmarks/python/test_huggingface_attn_fwd.py
@@ -157,7 +157,7 @@ def test_huggingface_attn_fwd_baseline_benchmark(
"eager": huggingface_attn_fwd,
"torchcompile": torch.compile(huggingface_attn_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
2 changes: 1 addition & 1 deletion benchmarks/python/test_layernorm_fwd.py
@@ -128,7 +128,7 @@ def test_layernorm_fwd_baseline_benchmark(
"eager": layernorm_fwd,
"torchcompile": torch.compile(layernorm_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
2 changes: 1 addition & 1 deletion benchmarks/python/test_nanogpt_attn_bwd.py
@@ -159,7 +159,7 @@ def nanogpt_attn_fwd():
"torchcompile": torch.compile(nanogpt_attn_fwd),
}
outputs = fwd_fn[executor]()

grads = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype)

# Manually compute IOBytes: See PR #1725
6 changes: 3 additions & 3 deletions benchmarks/python/test_nanogpt_attn_fwd.py
@@ -154,12 +154,12 @@ def test_nanogpt_attn_fwd_baseline_benchmark(
bias = torch.tril(torch.ones(seq_len, seq_len, device="cuda")).view(
1, 1, seq_len, seq_len
)

benchmark_fn = {
"eager": nanogpt_attn_fwd,
"torchcompile": torch.compile(nanogpt_attn_fwd)
"torchcompile": torch.compile(nanogpt_attn_fwd),
}

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
4 changes: 2 additions & 2 deletions benchmarks/python/test_pointwise_mul.py
@@ -62,10 +62,10 @@ def test_pointwise_mul_baseline_benchmark(
if executor == "torchcompile":
clear_dynamo_cache()
input = torch.randn(size, device="cuda", dtype=dtype)

benchmark_fn = {
"eager": pointwise_mul_fwd_fn,
"torchcompile": torch.compile(pointwise_mul_fwd_fn)
"torchcompile": torch.compile(pointwise_mul_fwd_fn),
}
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_reduction.py
@@ -67,10 +67,10 @@ def test_reduction_baseline_benchmark(
if executor == "torchcompile":
clear_dynamo_cache()
input = torch.randn(size, device="cuda", dtype=dtype)

benchmark_fn = {
"eager": reduction_fwd_fn,
"torchcompile": torch.compile(reduction_fwd_fn)
"torchcompile": torch.compile(reduction_fwd_fn),
}
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
6 changes: 3 additions & 3 deletions benchmarks/python/test_reduction_epilogue.py
@@ -83,12 +83,12 @@ def test_reduction_epilogue_baseline_benchmark(
x = torch.randn(size, device="cuda", dtype=dtype)
epilogue = torch.randn(size[reduction_axis - 1], device="cuda", dtype=dtype)
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation

benchmark_fn = {
"eager": reduction_epilogue_fwd_fn,
"torchcompile": torch.compile(reduction_epilogue_fwd_fn)
"torchcompile": torch.compile(reduction_epilogue_fwd_fn),
}

run_benchmark(
benchmark,
benchmark_fn[executor],
5 changes: 1 addition & 4 deletions benchmarks/python/test_rmsnorm_bwd.py
@@ -134,10 +134,7 @@ def rmsnorm_fwd():
return output

# Compile the fwd fn for torchcompile
-fwd_fn = {
-    "eager": rmsnorm_fwd,
-    "torchcompile": torch.compile(rmsnorm_fwd)
-}
+fwd_fn = {"eager": rmsnorm_fwd, "torchcompile": torch.compile(rmsnorm_fwd)}
outputs = fwd_fn[executor]()

# Manually compute IOBytes: See PR #1725
2 changes: 1 addition & 1 deletion benchmarks/python/test_rmsnorm_fwd.py
@@ -102,7 +102,7 @@ def test_rmsnorm_fwd_baseline_benchmark(

benchmark_fn = {
"eager": rmsnorm_fwd_fn,
"torchcompile": torch.compile(rmsnorm_fwd_fn)
"torchcompile": torch.compile(rmsnorm_fwd_fn),
}
# Manually compute IOBytes: See PR #1725
run_benchmark(
5 changes: 1 addition & 4 deletions benchmarks/python/test_scale_bias_relu_bwd.py
@@ -99,10 +99,7 @@ def sbr_fwd():
return torch.nn.functional.relu(inputs * scale + bias)

# Compile the fwd fn for torchcompile
-fwd_fn = {
-    "eager": sbr_fwd,
-    "torchcompile": torch.compile(sbr_fwd)
-}
+fwd_fn = {"eager": sbr_fwd, "torchcompile": torch.compile(sbr_fwd)}
outputs = fwd_fn[executor]()

run_benchmark(
7 changes: 2 additions & 5 deletions benchmarks/python/test_scale_bias_relu_fwd.py
@@ -97,11 +97,8 @@ def test_sbr_fwd_baseline_benchmark(
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
scale = torch.ones(size[-1], device="cuda", dtype=dtype)

-benchmark_fn = {
-    "eager": sbr_fwd_fn,
-    "torchcompile": torch.compile(sbr_fwd_fn)
-}
+benchmark_fn = {"eager": sbr_fwd_fn, "torchcompile": torch.compile(sbr_fwd_fn)}

run_benchmark(
benchmark,
benchmark_fn[executor],
5 changes: 1 addition & 4 deletions benchmarks/python/test_silu_mul_bwd.py
@@ -98,10 +98,7 @@ def silu_mul_fwd():
return torch.nn.functional.silu(x) * y

# Compile the fwd fn for torchcompile
-fwd_fn = {
-    "eager": silu_mul_fwd,
-    "torchcompile": torch.compile(silu_mul_fwd)
-}
+fwd_fn = {"eager": silu_mul_fwd, "torchcompile": torch.compile(silu_mul_fwd)}
outputs = fwd_fn[executor]()

run_benchmark(
4 changes: 2 additions & 2 deletions benchmarks/python/test_silu_mul_fwd.py
@@ -71,9 +71,9 @@ def test_silu_mul_fwd_baseline_benchmark(

benchmark_fn = {
"eager": silu_mul_fwd_fn,
"torchcompile": torch.compile(silu_mul_fwd_fn)
"torchcompile": torch.compile(silu_mul_fwd_fn),
}

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
benchmark,
5 changes: 1 addition & 4 deletions benchmarks/python/test_softmax_bwd.py
@@ -110,10 +110,7 @@ def test_softmax_bwd_baseline_benchmark(
def softmax_fwd():
return torch.nn.functional.softmax(input, dim=reduction_axis)

-fwd_fn = {
-    "eager": softmax_fwd,
-    "torchcompile": torch.compile(softmax_fwd)
-}
+fwd_fn = {"eager": softmax_fwd, "torchcompile": torch.compile(softmax_fwd)}
outputs = fwd_fn[executor]()

run_benchmark(
2 changes: 1 addition & 1 deletion benchmarks/python/test_softmax_fwd.py
@@ -98,7 +98,7 @@ def test_softmax_fwd_baseline_benchmark(

benchmark_fn = {
"eager": softmax_fwd_fn,
"torchcompile": torch.compile(softmax_fwd_fn)
"torchcompile": torch.compile(softmax_fwd_fn),
}
run_benchmark(
benchmark,
6 changes: 3 additions & 3 deletions benchmarks/python/test_transpose.py
@@ -89,12 +89,12 @@ def test_transpose_baseline_benchmark(
clear_dynamo_cache()
input1 = torch.randn(size, device="cuda", dtype=dtype)
input2 = torch.randn(size, device="cuda", dtype=dtype)

benchmark_fn = {
"eager": transpose_fwd_fn,
"torchcompile": torch.compile(transpose_fwd_fn)
"torchcompile": torch.compile(transpose_fwd_fn),
}

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
benchmark,