Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmark failure in lit-gpt falcon-7b model #2264

Closed
xwang233 opened this issue May 16, 2024 · 2 comments
Closed

Benchmark failure in lit-gpt falcon-7b model #2264

xwang233 opened this issue May 16, 2024 · 2 comments

Comments

@xwang233
Copy link
Collaborator

reproduce:

pjnl-20240516, H100

root@7a32d69c1587:/opt/pytorch/lightning-thunder# NVFUSER_DISABLE=parallel_compile torchrun --nproc_per_node 2 --nnodes 1 thunder/benchmarks/benchmark_litgpt.py --model_name falcon-7b     --distributed_mode ddp     --compile thunder_cudnn

(part of the) stack trace:

An error occurred while executing nvFuser FusionDefinition 7.
If you believe this is a bug or need assistance, please file an issue at https://github.com/NVIDIA/Fuser/issues/new
Here's a script to reproduce the error:

import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id7(fd : FusionDefinition) -> None :
    """Populate ``fd`` with the fusion reported as failing (FusionDefinition 7).

    Auto-generated repro: the op sequence is kept exactly as nvFuser emitted it.

    Inputs declared on ``fd`` (in order):
        T0, T1  -- 4-D Float tensors.
        T2..T9  -- 4-D BFloat16 tensors.
        T10     -- 3-D BFloat16 tensor.
    Every input has a leading extent of 1; the remaining extents are given
    as -1 (presumably "dynamic" in the nvFuser frontend -- confirm in its docs).

    Outputs registered on ``fd`` (in order): T42, T43, T21.

    NOTE(review): the T22..T43 section looks like a rotary-position-embedding
    pattern and the T11..T21 section like an erf-based GELU; this is inferred
    from the arithmetic only, not stated anywhere in the script.
    """
    # --- Input declarations -------------------------------------------------
    T0 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 1, 0])
    T1 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 1, 0])
    T2 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[False, None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 3, 0])
    T3 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[False, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 3, 0])
    T4 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[False, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 3, 0])
    T5 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T6 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T7 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T8 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[False, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 3, 0])
    T9 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T10 = fd.define_tensor(shape=[1, -1, -1], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    # --- T21 = bf16( x * (0.5 + 0.5 * erf(x * (1 / 1.41421))) ), x = float(T10)
    # 1.41421 is approximately sqrt(2).
    T11 = fd.ops.cast(T10, dtype=DataType.Float)
    S12 = fd.define_scalar(1.41421, dtype=DataType.Double)
    S13 = fd.ops.reciprocal(S12)
    T14 = fd.ops.mul(T11, S13)
    T15 = fd.ops.erf(T14)
    S16 = fd.define_scalar(0.500000, dtype=DataType.Double)
    T17 = fd.ops.mul(S16, T15)
    S18 = fd.define_scalar(0.500000, dtype=DataType.Double)
    T19 = fd.ops.add(S18, T17)
    T20 = fd.ops.mul(T11, T19)
    T21 = fd.ops.cast(T20, dtype=DataType.BFloat16)
    # --- T31 = bf16( T2 * T0 + cat([-T4, T3], dim=-1) * T1 ) ------------------
    # The negation round-trips through Float before being cast back to BFloat16.
    T22 = fd.ops.cast(T4, dtype=DataType.Float)
    T23 = fd.ops.neg(T22)
    T24 = fd.ops.cast(T23, dtype=DataType.BFloat16)
    T25 = fd.ops.cat([T24, T3], dim=-1)
    T26 = fd.ops.cast(T2, dtype=DataType.Float)
    T27 = fd.ops.mul(T26, T0)
    T28 = fd.ops.cast(T25, dtype=DataType.Float)
    T29 = fd.ops.mul(T28, T1)
    T30 = fd.ops.add(T27, T29)
    T31 = fd.ops.cast(T30, dtype=DataType.BFloat16)
    # --- T41 = bf16( T5 * T0 + cat([-T7, T6], dim=-1) * T1 ) ------------------
    # Same computation as above, applied to T5/T6/T7 with the same T0 and T1.
    T32 = fd.ops.cast(T7, dtype=DataType.Float)
    T33 = fd.ops.neg(T32)
    T34 = fd.ops.cast(T33, dtype=DataType.BFloat16)
    T35 = fd.ops.cat([T34, T6], dim=-1)
    T36 = fd.ops.cast(T5, dtype=DataType.Float)
    T37 = fd.ops.mul(T36, T0)
    T38 = fd.ops.cast(T35, dtype=DataType.Float)
    T39 = fd.ops.mul(T38, T1)
    T40 = fd.ops.add(T37, T39)
    T41 = fd.ops.cast(T40, dtype=DataType.BFloat16)
    # --- Append T8 / T9 along the last dimension ------------------------------
    # In the repro inputs that accompany this definition, the tensors bound to
    # T8 and T9 have a last extent of 0, so these cats append nothing there.
    T42 = fd.ops.cat([T31, T8], dim=-1)
    T43 = fd.ops.cat([T41, T9], dim=-1)
    # --- Outputs, in registration order ---------------------------------------
    fd.add_output(T42)
    fd.add_output(T43)
    fd.add_output(T21)

# Build the fusion definition by recording the ops from nvfuser_fusion_id7.
with FusionDefinition() as fd:
    nvfuser_fusion_id7(fd)

# Concrete inputs for the repro, listed in the same order as the T0..T10
# declarations (presumably bound positionally -- confirm against nvFuser docs).
#
# * Every tensor is placed on 'cuda:1', i.e. device index 1, so a second GPU
#   must be visible for this script to run as written.
# * Each tensor is a flat randn buffer re-viewed with as_strided; for the
#   non-empty inputs the buffer length equals the largest element offset the
#   view can address, plus one.
# * A stride of 0 makes every index along that dimension alias the same memory.
inputs = [
    # T0, T1: Float, shape (1, 71, 2048, 64); dim 1 has stride 0.
    torch.randn((131072,), dtype=torch.float32, device='cuda:1').as_strided((1, 71, 2048, 64), (131072, 0, 64, 1)),
    torch.randn((131072,), dtype=torch.float32, device='cuda:1').as_strided((1, 71, 2048, 64), (131072, 0, 64, 1)),
    # T2, T3, T4: BFloat16, strides (4544, 64, 4672, 1); last extent 64, 32, 32.
    torch.randn((9568128,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 64), (4544, 64, 4672, 1)),
    torch.randn((9568096,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 32), (4544, 64, 4672, 1)),
    torch.randn((9568096,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 32), (4544, 64, 4672, 1)),
    # T5, T6, T7: BFloat16, strides (0, 0, 4672, 1); last extent 64, 32, 32.
    torch.randn((9563648,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 64), (0, 0, 4672, 1)),
    torch.randn((9563616,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 32), (0, 0, 4672, 1)),
    torch.randn((9563616,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 32), (0, 0, 4672, 1)),
    # T8, T9: zero-element tensors (last extent 0) backed by empty buffers.
    torch.randn((0,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 0), (4544, 64, 4672, 1)),
    torch.randn((0,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 0), (0, 0, 4672, 1)),
    # T10: contiguous BFloat16, shape (1, 2048, 18176).
    torch.randn((37224448,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 2048, 18176), (37224448, 18176, 1)),
]
# Run the fusion; the reported traceback originates from this execute() call.
fd.execute(inputs)


Traceback (most recent call last):
  File "/opt/pytorch/nvfuser/nvfuser/__init__.py", line 139, in execute
    result = self._execute(
RuntimeError: _result == CUDA_SUCCESS INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/executor_utils.cpp":888, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. CUDA error: CUDA_ERROR_ILLEGAL_ADDRESS failed with error an illegal memory access was encountered
Exception raised from invoke at /opt/pytorch/nvfuser/csrc/executor_utils.cpp:888 (most recent call first):
frame #0: nvfuser::nvfCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xf3 (0x7fb5375c7555 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #1: nvfuser::nvfErrorFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x53 (0x7fb5378c5823 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #2: <unknown function> + 0x455e42 (0x7fb5378fae42 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #3: <unknown function> + 0x4580e6 (0x7fb5378fd0e6 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #4: nvfuser::FusionExecutor::compileFusion(nvfuser::Fusion*, nvfuser::KernelArgumentHolder const&, nvfuser::LaunchParams const&, nvfuser::CompileParams, nvfuser::ScheduleHeuristic, long, long, long, long) + 0x14a9 (0x7fb5378ddec9 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #5: <unknown function> + 0x5df922 (0x7fb537a84922 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #6: nvfuser::FusionKernelRuntime::compileFusionParallel(nvfuser::KernelArgumentHolder) + 0x447 (0x7fb537a8c307 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #7: nvfuser::FusionExecutorCache::runFusionWithInputs(c10::ArrayRef<c10::IValue> const&, std::optional<nvfuser::PrimDataType>, std::optional<signed char>) + 0xad3 (0x7fb537a97f23 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #8: nvfuser::python_frontend::FusionDefinition::execute(c10::ArrayRef<c10::IValue> const&, bool, bool, std::optional<signed char>) const + 0x3c8 (0x7fb537c76488 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #9: <unknown function> + 0x18f7f5 (0x7fb5376347f5 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #10: <unknown function> + 0x203812 (0x7fb5376a8812 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #11: <unknown function> + 0x28b870 (0x7fb537730870 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #12: <unknown function> + 0x15a10e (0x55f83485510e in /usr/bin/python)
frame #13: _PyObject_MakeTpCall + 0x25b (0x55f83484ba7b in /usr/bin/python)
frame #14: <unknown function> + 0x168acb (0x55f834863acb in /usr/bin/python)
frame #15: _PyEval_EvalFrameDefault + 0x198c (0x55f83483f53c in /usr/bin/python)
frame #16: <unknown function> + 0x16893e (0x55f83486393e in /usr/bin/python)
frame #17: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #18: _PyObject_FastCallDictTstate + 0xc4 (0x55f83484ac14 in /usr/bin/python)
frame #19: _PyObject_Call_Prepend + 0xc1 (0x55f8348608d1 in /usr/bin/python)
frame #20: <unknown function> + 0x280700 (0x55f83497b700 in /usr/bin/python)
frame #21: _PyObject_MakeTpCall + 0x25b (0x55f83484ba7b in /usr/bin/python)
frame #22: _PyEval_EvalFrameDefault + 0x64e6 (0x55f834844096 in /usr/bin/python)
frame #23: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #24: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #25: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #26: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #27: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #28: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #29: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #30: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #31: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #32: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #33: <unknown function> + 0x16893e (0x55f83486393e in /usr/bin/python)
frame #34: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #35: <unknown function> + 0x16893e (0x55f83486393e in /usr/bin/python)
frame #36: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #37: _PyObject_FastCallDictTstate + 0xc4 (0x55f83484ac14 in /usr/bin/python)
frame #38: _PyObject_Call_Prepend + 0x5c (0x55f83486086c in /usr/bin/python)
frame #39: <unknown function> + 0x280700 (0x55f83497b700 in /usr/bin/python)
frame #40: _PyObject_MakeTpCall + 0x25b (0x55f83484ba7b in /usr/bin/python)
frame #41: _PyEval_EvalFrameDefault + 0x64e6 (0x55f834844096 in /usr/bin/python)
frame #42: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #43: _PyEval_EvalFrameDefault + 0x6bd (0x55f83483e26d in /usr/bin/python)
frame #44: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #45: _PyEval_EvalFrameDefault + 0x8ac (0x55f83483e45c in /usr/bin/python)
frame #46: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #47: PyObject_Call + 0x122 (0x55f834864492 in /usr/bin/python)
frame #48: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #49: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #50: _PyEval_EvalFrameDefault + 0x6bd (0x55f83483e26d in /usr/bin/python)
frame #51: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #52: _PyEval_EvalFrameDefault + 0x6bd (0x55f83483e26d in /usr/bin/python)
frame #53: <unknown function> + 0x13f9c6 (0x55f83483a9c6 in /usr/bin/python)
frame #54: PyEval_EvalCode + 0x86 (0x55f834930256 in /usr/bin/python)
frame #55: <unknown function> + 0x260108 (0x55f83495b108 in /usr/bin/python)
frame #56: <unknown function> + 0x2599cb (0x55f8349549cb in /usr/bin/python)
frame #57: <unknown function> + 0x25fe55 (0x55f83495ae55 in /usr/bin/python)
frame #58: _PyRun_SimpleFileObject + 0x1a8 (0x55f83495a338 in /usr/bin/python)
frame #59: _PyRun_AnyFileObject + 0x43 (0x55f834959f83 in /usr/bin/python)
frame #60: Py_RunMain + 0x2be (0x55f83494ca5e in /usr/bin/python)
frame #61: Py_BytesMain + 0x2d (0x55f83492302d in /usr/bin/python)
frame #62: <unknown function> + 0x29d90 (0x7fb829078d90 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #63: __libc_start_main + 0x80 (0x7fb829078e40 in /usr/lib/x86_64-linux-gnu/libc.so.6)

cc @tfogal @naoyam to assign

@kevinstephano
Copy link
Collaborator

I don't think this is an nvFuser issue. I re-ran the snippet and it passes. What is likely happening is that the nvFuser fusion performs a CUDA error check that catches an error occurring downstream of the actual issue.

@kevinstephano
Copy link
Collaborator

This issue should have been closed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants