Merge branch 'main' of https://github.com/NVIDIA/Fuser into move_executor
csarofeen committed Aug 4, 2024
2 parents 8ccbaea + 346e51c commit 033486e
Showing 36 changed files with 558 additions and 258 deletions.
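
Note: the diffs below repeatedly import clear_dynamo_cache from benchmarks/python/core.py and call it behind an "if compile:" guard in the torch.compile baseline benchmarks. The change to core.py itself is not among the files shown here, so the following is only a hedged sketch of what such a helper plausibly looks like, assuming it simply resets TorchDynamo's compilation caches via torch._dynamo.reset():

import torch

def clear_dynamo_cache() -> None:
    # Hypothetical sketch, not the actual core.py implementation from this commit:
    # torch._dynamo.reset() discards TorchDynamo's compiled-graph caches so each
    # torch.compile baseline benchmark starts from a cold compilation state.
    torch._dynamo.reset()

Guarding the call with "if compile:" leaves the eager-mode baselines untouched, since only the compiled variants populate the Dynamo cache.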
6 changes: 5 additions & 1 deletion benchmarks/python/normalization.py
@@ -5,7 +5,7 @@
from .global_params import PROMOTE_DTYPES
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
import torch
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch, clear_dynamo_cache
import numpy as np


@@ -441,6 +441,8 @@ def norm_fwd_baseline_benchmark(
norm: str,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

assert norm in ["batch_norm", "instance_norm"], NotImplementedError

@@ -474,6 +476,8 @@ def norm_bwd_baseline_benchmark(
norm: str,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

assert norm in ["batch_norm", "instance_norm"], NotImplementedError

4 changes: 4 additions & 0 deletions benchmarks/python/test_dropout_layernorm_bwd.py
@@ -7,6 +7,7 @@
from .core import (
run_benchmark,
clear_cuda_cache,
clear_dynamo_cache,
unary_bwd_torch,
compute_total_iobytes,
)
@@ -201,6 +202,9 @@ def test_dropout_layernorm_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

dropout_p = 0.2
input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
input2 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
9 changes: 8 additions & 1 deletion benchmarks/python/test_dropout_layernorm_fwd.py
@@ -4,7 +4,12 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, compute_total_iobytes
from .core import (
run_benchmark,
clear_cuda_cache,
clear_dynamo_cache,
compute_total_iobytes,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -167,6 +172,8 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

dropout_p = 0.2
inputs = [
3 changes: 3 additions & 0 deletions benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -7,6 +7,7 @@
from .core import (
run_benchmark,
clear_cuda_cache,
clear_dynamo_cache,
unary_bwd_torch,
compute_total_iobytes,
)
@@ -181,6 +182,8 @@ def test_dropout_rmsnorm_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
dropout_p = 0.2
input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
input2 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
9 changes: 8 additions & 1 deletion benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -4,7 +4,12 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, compute_total_iobytes
from .core import (
run_benchmark,
clear_cuda_cache,
clear_dynamo_cache,
compute_total_iobytes,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -153,6 +158,8 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
dropout_p = 0.2

inputs = [
4 changes: 3 additions & 1 deletion benchmarks/python/test_gelu_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -100,6 +100,8 @@ def test_gelu_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
grads = torch.randn(size, device="cuda", dtype=dtype)
4 changes: 3 additions & 1 deletion benchmarks/python/test_gelu_bwd_reduction.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -117,6 +117,8 @@ def test_gelu_bwd_reduction_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
grads = torch.randn(size, device="cuda", dtype=dtype)
4 changes: 3 additions & 1 deletion benchmarks/python/test_gelu_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -79,6 +79,8 @@ def test_gelu_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
inputs = [
torch.randn(size, device="cuda", dtype=dtype, requires_grad=True), # in_tensor
torch.ones(size[-1], device="cuda", dtype=dtype), # bias
4 changes: 3 additions & 1 deletion benchmarks/python/test_groupnorm_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -131,6 +131,8 @@ def test_groupnorm_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
N, C, H, W = size
x = torch.randn(size, device="cuda", dtype=dtype)
weight = torch.randn(C, device="cuda", dtype=dtype)
5 changes: 3 additions & 2 deletions benchmarks/python/test_huggingface_attn_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -119,7 +119,8 @@ def test_huggingface_attn_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
batch_size, seq_len, nh, n_embd = size
dropout_p = 0.2
inputs = torch.randn(
5 changes: 3 additions & 2 deletions benchmarks/python/test_huggingface_attn_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -147,7 +147,8 @@ def test_huggingface_attn_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
batch_size, seq_len, nh, n_embd = size
dropout_p = 0.2
inputs = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype)
6 changes: 5 additions & 1 deletion benchmarks/python/test_layernorm_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -157,6 +157,10 @@ def test_layernorm_bwd_baseline_benchmark(
dtype: torch.dtype,
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
grads = torch.randn(*size, device="cuda", dtype=dtype)
weights = torch.randn(size[1], device="cuda", dtype=dtype, requires_grad=True)
5 changes: 3 additions & 2 deletions benchmarks/python/test_nanogpt_attn_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -135,7 +135,8 @@ def test_nanogpt_attn_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
batch_size, seq_len, nh, n_embd = size
dropout_p = 0.2
inputs = torch.randn(
5 changes: 3 additions & 2 deletions benchmarks/python/test_nanogpt_attn_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -148,7 +148,8 @@ def test_nanogpt_attn_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
batch_size, seq_len, nh, n_embd = size
dropout_p = 0.2
inputs = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype)
5 changes: 3 additions & 2 deletions benchmarks/python/test_pointwise_mul.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -62,7 +62,8 @@ def test_pointwise_mul_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
input = torch.randn(size, device="cuda", dtype=dtype)
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
5 changes: 3 additions & 2 deletions benchmarks/python/test_reduction.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -67,7 +67,8 @@ def test_reduction_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
input = torch.randn(size, device="cuda", dtype=dtype)
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
5 changes: 3 additions & 2 deletions benchmarks/python/test_reduction_epilogue.py
@@ -5,7 +5,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -80,7 +80,8 @@ def test_reduction_epilogue_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
x = torch.randn(size, device="cuda", dtype=dtype)
epilogue = torch.randn(size[reduction_axis - 1], device="cuda", dtype=dtype)
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
5 changes: 3 additions & 2 deletions benchmarks/python/test_rmsnorm_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -92,7 +92,8 @@ def test_rmsnorm_bwd_nvf_benchmark(
eps: float = 1e-5,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
grads = torch.randn(size, device="cuda", dtype=dtype)
weights = torch.randn(size[1], device="cuda", dtype=dtype, requires_grad=True)
5 changes: 3 additions & 2 deletions benchmarks/python/test_rmsnorm_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -98,7 +98,8 @@ def test_rmsnorm_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
inputs = torch.randn(size, device="cuda", dtype=dtype)
weights = torch.randn(size[1], device="cuda", dtype=dtype)

5 changes: 3 additions & 2 deletions benchmarks/python/test_scale_bias_relu_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -91,7 +91,8 @@ def test_sbr_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
grads = torch.randn(*size, device="cuda", dtype=dtype)
scale = torch.ones(size[-1], device="cuda", dtype=dtype)
(Diffs for the remaining 16 changed files were not loaded.)