Merge branch 'main' of https://github.com/NVIDIA/Fuser into move_executor
csarofeen committed Aug 4, 2024
2 parents 8ccbaea + 346e51c commit 033486e
Showing 36 changed files with 558 additions and 258 deletions.
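
Note: the diffs below repeatedly import clear_dynamo_cache from benchmarks/python/core.py and call it behind an "if compile:" guard in the torch.compile baseline benchmarks. The change to core.py itself is not among the files shown here, so the following is only a hedged sketch of what such a helper plausibly looks like, assuming it simply resets TorchDynamo's compilation caches via torch._dynamo.reset():

import torch

def clear_dynamo_cache() -> None:
    # Hypothetical sketch, not the actual core.py implementation from this commit:
    # torch._dynamo.reset() discards TorchDynamo's compiled-graph caches so each
    # torch.compile baseline benchmark starts from a cold compilation state.
    torch._dynamo.reset()

Guarding the call with "if compile:" leaves the eager-mode baselines untouched, since only the compiled variants populate the Dynamo cache.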
6 changes: 5 additions & 1 deletion benchmarks/python/normalization.py
@@ -5,7 +5,7 @@
from .global_params import PROMOTE_DTYPES
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
import torch
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch, clear_dynamo_cache
import numpy as np


@@ -441,6 +441,8 @@ def norm_fwd_baseline_benchmark(
norm: str,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

assert norm in ["batch_norm", "instance_norm"], NotImplementedError

@@ -474,6 +476,8 @@ def norm_bwd_baseline_benchmark(
norm: str,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

assert norm in ["batch_norm", "instance_norm"], NotImplementedError

4 changes: 4 additions & 0 deletions benchmarks/python/test_dropout_layernorm_bwd.py
@@ -7,6 +7,7 @@
from .core import (
run_benchmark,
clear_cuda_cache,
clear_dynamo_cache,
unary_bwd_torch,
compute_total_iobytes,
)
@@ -201,6 +202,9 @@ def test_dropout_layernorm_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

dropout_p = 0.2
input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
input2 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
9 changes: 8 additions & 1 deletion benchmarks/python/test_dropout_layernorm_fwd.py
@@ -4,7 +4,12 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, compute_total_iobytes
from .core import (
run_benchmark,
clear_cuda_cache,
clear_dynamo_cache,
compute_total_iobytes,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -167,6 +172,8 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

dropout_p = 0.2
inputs = [
3 changes: 3 additions & 0 deletions benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -7,6 +7,7 @@
from .core import (
run_benchmark,
clear_cuda_cache,
clear_dynamo_cache,
unary_bwd_torch,
compute_total_iobytes,
)
@@ -181,6 +182,8 @@ def test_dropout_rmsnorm_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
dropout_p = 0.2
input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
input2 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
9 changes: 8 additions & 1 deletion benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -4,7 +4,12 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, compute_total_iobytes
from .core import (
run_benchmark,
clear_cuda_cache,
clear_dynamo_cache,
compute_total_iobytes,
)
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -153,6 +158,8 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
dropout_p = 0.2

inputs = [
4 changes: 3 additions & 1 deletion benchmarks/python/test_gelu_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -100,6 +100,8 @@ def test_gelu_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
grads = torch.randn(size, device="cuda", dtype=dtype)
4 changes: 3 additions & 1 deletion benchmarks/python/test_gelu_bwd_reduction.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -117,6 +117,8 @@ def test_gelu_bwd_reduction_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
bias = torch.ones(size[-1], device="cuda", dtype=dtype)
grads = torch.randn(size, device="cuda", dtype=dtype)
4 changes: 3 additions & 1 deletion benchmarks/python/test_gelu_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -79,6 +79,8 @@ def test_gelu_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
inputs = [
torch.randn(size, device="cuda", dtype=dtype, requires_grad=True), # in_tensor
torch.ones(size[-1], device="cuda", dtype=dtype), # bias
4 changes: 3 additions & 1 deletion benchmarks/python/test_groupnorm_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -131,6 +131,8 @@ def test_groupnorm_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()
N, C, H, W = size
x = torch.randn(size, device="cuda", dtype=dtype)
weight = torch.randn(C, device="cuda", dtype=dtype)
5 changes: 3 additions & 2 deletions benchmarks/python/test_huggingface_attn_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -119,7 +119,8 @@ def test_huggingface_attn_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
batch_size, seq_len, nh, n_embd = size
dropout_p = 0.2
inputs = torch.randn(
5 changes: 3 additions & 2 deletions benchmarks/python/test_huggingface_attn_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -147,7 +147,8 @@ def test_huggingface_attn_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
batch_size, seq_len, nh, n_embd = size
dropout_p = 0.2
inputs = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype)
6 changes: 5 additions & 1 deletion benchmarks/python/test_layernorm_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -157,6 +157,10 @@ def test_layernorm_bwd_baseline_benchmark(
dtype: torch.dtype,
compile: bool,
):
clear_cuda_cache()
if compile:
clear_dynamo_cache()

inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
grads = torch.randn(*size, device="cuda", dtype=dtype)
weights = torch.randn(size[1], device="cuda", dtype=dtype, requires_grad=True)
5 changes: 3 additions & 2 deletions benchmarks/python/test_nanogpt_attn_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -135,7 +135,8 @@ def test_nanogpt_attn_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
batch_size, seq_len, nh, n_embd = size
dropout_p = 0.2
inputs = torch.randn(
5 changes: 3 additions & 2 deletions benchmarks/python/test_nanogpt_attn_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_attn_inputs, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -148,7 +148,8 @@ def test_nanogpt_attn_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
batch_size, seq_len, nh, n_embd = size
dropout_p = 0.2
inputs = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype)
5 changes: 3 additions & 2 deletions benchmarks/python/test_pointwise_mul.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -62,7 +62,8 @@ def test_pointwise_mul_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
input = torch.randn(size, device="cuda", dtype=dtype)
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
5 changes: 3 additions & 2 deletions benchmarks/python/test_reduction.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -67,7 +67,8 @@ def test_reduction_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
input = torch.randn(size, device="cuda", dtype=dtype)
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
run_benchmark(
5 changes: 3 additions & 2 deletions benchmarks/python/test_reduction_epilogue.py
@@ -5,7 +5,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES

@@ -80,7 +80,8 @@ def test_reduction_epilogue_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
x = torch.randn(size, device="cuda", dtype=dtype)
epilogue = torch.randn(size[reduction_axis - 1], device="cuda", dtype=dtype)
# Inputs and outputs are same as nvFuser, no need for manual IOByte computation
5 changes: 3 additions & 2 deletions benchmarks/python/test_rmsnorm_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -92,7 +92,8 @@ def test_rmsnorm_bwd_nvf_benchmark(
eps: float = 1e-5,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
grads = torch.randn(size, device="cuda", dtype=dtype)
weights = torch.randn(size[1], device="cuda", dtype=dtype, requires_grad=True)
5 changes: 3 additions & 2 deletions benchmarks/python/test_rmsnorm_fwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -98,7 +98,8 @@ def test_rmsnorm_fwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
inputs = torch.randn(size, device="cuda", dtype=dtype)
weights = torch.randn(size[1], device="cuda", dtype=dtype)

5 changes: 3 additions & 2 deletions benchmarks/python/test_scale_bias_relu_bwd.py
@@ -4,7 +4,7 @@
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_cuda_cache, unary_bwd_torch
from .core import run_benchmark, clear_cuda_cache, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
import numpy as np
@@ -91,7 +91,8 @@ def test_sbr_bwd_baseline_benchmark(
compile: bool,
):
clear_cuda_cache()

if compile:
clear_dynamo_cache()
inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
grads = torch.randn(*size, device="cuda", dtype=dtype)
scale = torch.ones(size[-1], device="cuda", dtype=dtype)
(Diffs for the remaining 16 changed files were not loaded.)