Merge remote-tracking branch 'origin/main' into resize_scheduler
naoyam committed Dec 13, 2024
2 parents 8c4b919 + 7d1b64b commit 498b5f9
Showing 117 changed files with 4,934 additions and 2,112 deletions.
13 changes: 13 additions & 0 deletions benchmarks/python/core.py
@@ -10,6 +10,9 @@
from nvfuser import FusionDefinition, FusionCache
from nvfuser.pytorch_utils import DEVICE_PROPERTIES
import warnings
import thunder
from thunder.executors.nvfuserex import nvfuserex


# These variables can be overwritten through CLI commands
# --benchmark-rounds=rounds --benchmark-warmup-rounds=warmup_rounds
@@ -44,6 +47,16 @@ def unary_bwd_torch(inputs: List): # [output, grad_out]
inputs[0].backward(inputs[1], retain_graph=True)


def with_executor(executor: str, fwd_fn: Callable) -> Callable:
    assert executor in ["eager", "torchcompile", "thunder"]
    if executor == "eager":
        return fwd_fn
    if executor == "torchcompile":
        return torch.compile(fwd_fn)
    if executor == "thunder":
        return thunder.jit(fwd_fn, nv_enable_bookend=False, executors=[nvfuserex])


def compute_total_iobytes(
    tensor_props: dict[str, tuple[int | tuple[int, ...], torch.dtype]]
):
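
Note: with_executor (added above) replaces the per-file {"eager": ..., "torchcompile": ...} dictionaries in the baseline benchmarks below. A minimal usage sketch follows, with a hypothetical add_bias function and shapes; it assumes with_executor is imported from benchmarks/python/core.py and that thunder is installed for the "thunder" path.

```python
import torch

# Hypothetical toy forward function; the real benchmarks pass their own fwd fns.
def add_bias(inputs: list):  # [x, bias]
    return inputs[0] + inputs[1]

x = torch.randn(1024, 1024, device="cuda")
bias = torch.randn(1024, device="cuda")

for executor in ["eager", "torchcompile", "thunder"]:
    # "eager" returns the function unchanged, "torchcompile" wraps it in
    # torch.compile, and "thunder" JIT-compiles it with the nvFuser executor.
    fn = with_executor(executor, add_bias)
    out = fn([x, bias])
```
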
11 changes: 6 additions & 5 deletions benchmarks/python/normalization.py
@@ -5,7 +5,7 @@
from .global_params import PROMOTE_DTYPES
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
import torch
from .core import run_benchmark, unary_bwd_torch, clear_dynamo_cache
from .core import run_benchmark, unary_bwd_torch, clear_dynamo_cache, with_executor
import numpy as np


@@ -453,12 +453,12 @@ def norm_fwd_baseline_benchmark(

norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn

benchmark_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
benchmark_fn = with_executor(executor, norm_fwd_fn)

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
[inputs, weight, bias, running_mean, running_var],
iobytes=norm_fwd_iobytes(size, dtype, norm),
)
@@ -493,8 +493,9 @@ def norm_bwd_baseline_benchmark(
norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn

# Compile the fwd fn for torchcompile
fwd_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)}
outputs = fwd_fn[executor]([inputs, weight, bias, running_mean, running_var])
fwd_fn = with_executor(executor, norm_fwd_fn)
fwd_inputs = [inputs, weight, bias, running_mean, running_var]
outputs = fwd_fn(fwd_inputs)

# Manually compute IOBytes: See PR #1725
run_benchmark(
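
In the backward baselines (norm_bwd_baseline_benchmark above and the test_*_bwd.py files below), the executor-wrapped forward runs once to produce outputs, and the function actually benchmarked is unary_bwd_torch. The run_benchmark calls are truncated in this view; the sketch below shows the general flow, with the argument list assumed for illustration rather than taken from this diff.

```python
# Sketch only: the exact run_benchmark arguments are truncated above and may differ.
fwd_fn = with_executor(executor, norm_fwd_fn)
outputs = fwd_fn([inputs, weight, bias, running_mean, running_var])

run_benchmark(
    benchmark,
    unary_bwd_torch,   # [output, grad_out] -> output.backward(grad_out, retain_graph=True)
    [outputs, grads],  # assumed backward inputs
)
```
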
9 changes: 3 additions & 6 deletions benchmarks/python/test_broadcast_add_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
from .core import run_benchmark, clear_dynamo_cache, with_executor
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -112,14 +112,11 @@ def test_bcast_add_baseline_benchmark(
x = x.t()
assert x.is_contiguous() == contiguous

benchmark_fn = {
"eager": bcast_add_fwd_fn,
"torchcompile": torch.compile(bcast_add_fwd_fn),
}
benchmark_fn = with_executor(executor, bcast_add_fwd_fn)

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
[bias, x, bcast_axis],
)
18 changes: 5 additions & 13 deletions benchmarks/python/test_dropout_layernorm_bwd.py
@@ -9,9 +9,11 @@
clear_dynamo_cache,
unary_bwd_torch,
compute_total_iobytes,
with_executor,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import dropout_layernorm


def dropout_layernorm_bwd_fusion(
@@ -208,20 +210,10 @@ def test_dropout_layernorm_bwd_baseline_benchmark(
weights = torch.randn(size[1], device="cuda", dtype=dtype, requires_grad=True)
bias = torch.randn(size[1], device="cuda", dtype=dtype, requires_grad=True)

def dropout_layernorm_fwd():
return torch.nn.functional.layer_norm(
input2 + torch.nn.functional.dropout(input1, p=dropout_p),
normalized_shape=input1.shape[1:],
weight=weights,
bias=bias,
)

# Compile the fwd fn for torchcompile
fwd_fn = {
"eager": dropout_layernorm_fwd,
"torchcompile": torch.compile(dropout_layernorm_fwd),
}
outputs = fwd_fn[executor]()
fwd_fn = with_executor(executor, dropout_layernorm)
fwd_inputs = [input1, input2, weights, bias, dropout_p]
outputs = fwd_fn(fwd_inputs)

# Manually compute IOBytes: See PR #1725
run_benchmark(
20 changes: 4 additions & 16 deletions benchmarks/python/test_dropout_layernorm_fwd.py
@@ -8,9 +8,11 @@
run_benchmark,
clear_dynamo_cache,
compute_total_iobytes,
with_executor,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import dropout_layernorm


def dropout_layernorm_fwd_fusion(
@@ -73,17 +75,6 @@ def dropout_layernorm_fwd_fusion(
fd.add_output(T10)


def dropout_layernorm_fwd(
    inputs: list,
):  # [in_tensor1, in_tensor2, weights, bias, dropout_p]
    return torch.nn.functional.layer_norm(
        inputs[1] + torch.nn.functional.dropout(inputs[0], p=inputs[-1]),
        normalized_shape=inputs[0].shape[1:],
        weight=inputs[2],
        bias=inputs[3],
    )


def dropout_layernorm_fwd_iobytes(size: tuple, dtype: torch.dtype):
# Manual IOByte computation is required since nvFuser outputs differ from baseline outputs (output).
nvf_inp_out = {
@@ -181,15 +172,12 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
dropout_p,
]

benchmark_fn = {
"eager": dropout_layernorm_fwd,
"torchcompile": torch.compile(dropout_layernorm_fwd),
}
benchmark_fn = with_executor(executor, dropout_layernorm)

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
inputs,
iobytes=dropout_layernorm_fwd_iobytes(size, dtype),
)
15 changes: 5 additions & 10 deletions benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -9,9 +9,11 @@
clear_dynamo_cache,
unary_bwd_torch,
compute_total_iobytes,
with_executor,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import dropout_rmsnorm


def dropout_rmsnorm_bwd_fusion(
@@ -186,16 +188,9 @@ def test_dropout_rmsnorm_bwd_baseline_benchmark(
grads = torch.randn(size, device="cuda", dtype=dtype)
weights = torch.randn(size[1], device="cuda", dtype=dtype)

def dropout_rmsnorm_fwd():
x = input2 + torch.nn.functional.dropout(input1, p=dropout_p)
output = weights * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)
return output

fwd_fn = {
"eager": dropout_rmsnorm_fwd,
"torchcompile": torch.compile(dropout_rmsnorm_fwd),
}
outputs = fwd_fn[executor]()
fwd_fn = with_executor(executor, dropout_rmsnorm)
fwd_inputs = [input1, input2, weights, dropout_p]
outputs = fwd_fn(fwd_inputs)

run_benchmark(
benchmark,
15 changes: 4 additions & 11 deletions benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -8,9 +8,11 @@
run_benchmark,
clear_dynamo_cache,
compute_total_iobytes,
with_executor,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import dropout_rmsnorm


def dropout_rmsnorm_fwd_fusion(
@@ -80,12 +82,6 @@ def dropout_rmsnorm_fwd_fusion(
fd.add_output(T28)


def dropout_rmsnorm_fwd(inputs: list):
    input1, input2, weights, dropout_p = inputs
    x = input2 + torch.nn.functional.dropout(input1, p=dropout_p)
    return weights * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)


def dropout_rmsnorm_fwd_iobytes(size: tuple, dtype: torch.dtype):
# Manual IOByte computation is required since nvFuser input/outputs differ from baseline outputs (output).
nvf_inp_out = {
@@ -165,15 +161,12 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
dropout_p,
]

benchmark_fn = {
"eager": dropout_rmsnorm_fwd,
"torchcompile": torch.compile(dropout_rmsnorm_fwd),
}
benchmark_fn = with_executor(executor, dropout_rmsnorm)

# Manually compute IOBytes: See PR #1725
run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
inputs,
iobytes=dropout_rmsnorm_fwd_iobytes(size, dtype),
)
14 changes: 5 additions & 9 deletions benchmarks/python/test_gelu_bwd.py
@@ -4,10 +4,11 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch
from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
from .torch_ops import gelu


def gelu_bwd_fusion(
@@ -103,14 +104,9 @@ def test_gelu_bwd_baseline_benchmark(
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
grads = torch.randn(size, device="cuda", dtype=dtype)

def gelu_fwd():
return torch.nn.functional.gelu(inputs + bias, approximate="tanh")

fwd_fn = {
"eager": gelu_fwd,
"torchcompile": torch.compile(gelu_fwd),
}
outputs = fwd_fn[executor]()
fwd_fn = with_executor(executor, gelu)
fwd_inputs = [inputs, bias]
outputs = fwd_fn(fwd_inputs)

run_benchmark(
benchmark,
9 changes: 3 additions & 6 deletions benchmarks/python/test_gelu_bwd_reduction.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
from .core import run_benchmark, clear_dynamo_cache, with_executor
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -121,14 +121,11 @@ def test_gelu_bwd_reduction_baseline_benchmark(
grads = torch.randn(size, device="cuda", dtype=dtype)
eager_output = torch.nn.functional.gelu(inputs + bias, approximate="tanh")

benchmark_fn = {
"eager": gelu_bwd_reduction_torch,
"torchcompile": torch.compile(gelu_bwd_reduction_torch),
}
benchmark_fn = with_executor(executor, gelu_bwd_reduction_torch)

run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
[eager_output, grads, inputs, reduction_axis],
iobytes=gelu_bwd_reduction_iobytes(size, dtype, reduction_axis),
)
16 changes: 5 additions & 11 deletions benchmarks/python/test_gelu_fwd.py
@@ -4,9 +4,10 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
from .core import run_benchmark, clear_dynamo_cache, with_executor
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
from .torch_ops import gelu


def gelu_fwd_fusion(
@@ -41,10 +42,6 @@ def gelu_fwd_fusion(
fd.add_output(T10)


def gelu_fwd_fn(inputs: list):  # [in_tensor, bias]
    return torch.nn.functional.gelu(inputs[0] + inputs[1], approximate="tanh")


@pytest.mark.parametrize("size", generate_input_sizes(dims=2))
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
def test_gelu_fwd_nvf_benchmark(
@@ -61,7 +58,7 @@ def test_gelu_fwd_nvf_benchmark(
with FusionDefinition() as fd:
gelu_fwd_fusion(fd, torch_dtype_to_nvfuser_dtype(dtype))
if not disable_validation:
eager_output = gelu_fwd_fn(inputs)
eager_output = gelu(inputs)
fd.validate(inputs, [eager_output])
if not disable_benchmarking:
run_benchmark(benchmark, fd.execute, inputs)
@@ -83,10 +80,7 @@ def test_gelu_fwd_baseline_benchmark(
torch.ones(size[-1], device="cuda", dtype=dtype), # bias
]

benchmark_fn = {
"eager": gelu_fwd_fn,
"torchcompile": torch.compile(gelu_fwd_fn),
}
benchmark_fn = with_executor(executor, gelu)

# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(benchmark, benchmark_fn[executor], inputs)
run_benchmark(benchmark, benchmark_fn, inputs)
15 changes: 4 additions & 11 deletions benchmarks/python/test_groupnorm_fwd.py
@@ -4,10 +4,8 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
from .core import run_benchmark, clear_dynamo_cache, with_executor
import torch
import thunder
from thunder.executors.nvfuserex import nvfuserex
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES


@@ -145,15 +143,10 @@ def test_groupnorm_fwd_baseline_benchmark(
bias = torch.randn(C, device="cuda", dtype=dtype)
num_groups = get_n_groups(C)

benchmark_fn = {
"eager": groupnorm_fwd,
"torchcompile": torch.compile(groupnorm_fwd),
"thunder": thunder.jit(
groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
),
}
benchmark_fn = with_executor(executor, groupnorm_fwd)

run_benchmark(
benchmark,
benchmark_fn[executor],
benchmark_fn,
[x, weight, bias, num_groups],
)
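
Several of the files above now import their eager forward implementations from a new benchmarks/python/torch_ops module (from .torch_ops import dropout_layernorm, dropout_rmsnorm, gelu) instead of defining them inline. That module is not shown in this view; the sketch below reconstructs plausible contents from the inline definitions removed in this commit, using single-list signatures to match how the callers pass fwd_inputs.

```python
# Sketch of benchmarks/python/torch_ops.py, reconstructed from the inline
# definitions removed in this commit; the actual module may differ.
import torch


def dropout_layernorm(inputs: list):  # [input1, input2, weights, bias, dropout_p]
    input1, input2, weights, bias, dropout_p = inputs
    return torch.nn.functional.layer_norm(
        input2 + torch.nn.functional.dropout(input1, p=dropout_p),
        normalized_shape=input1.shape[1:],
        weight=weights,
        bias=bias,
    )


def dropout_rmsnorm(inputs: list):  # [input1, input2, weights, dropout_p]
    input1, input2, weights, dropout_p = inputs
    x = input2 + torch.nn.functional.dropout(input1, p=dropout_p)
    return weights * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)


def gelu(inputs: list):  # [in_tensor, bias]
    return torch.nn.functional.gelu(inputs[0] + inputs[1], approximate="tanh")
```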