Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Internal Assert Failed #3497

Open
t-vi opened this issue Nov 29, 2024 · 1 comment
Open

Internal Assert Failed #3497

t-vi opened this issue Nov 29, 2024 · 1 comment
Labels

Comments

@t-vi
Copy link
Contributor

t-vi commented Nov 29, 2024

I am not entirely sure if this is a valid fusion definition.
The error happens when I run HF Llama 3.2 1b in training mode (passing input_ids and labels), with NVFuser followed by a TorchCompile executor (which is nonstandard).
Please do close if you think the fusion definition is at fault, I'm just sending it because the error said internal assert rather than bad input.

An error occurred while executing nvFuser FusionDefinition 38.
If you believe this is a bug or need assistance, please file an issue at https://github.com/NVIDIA/Fuser/issues/new
Here's a script to reproduce the error:

# CUDA devices:
#  0: NVIDIA L40S
# torch version: 2.5.1+cu124
# cuda version: 12.4
# nvfuser version: 0.2.22+gitc14d418
import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id38(fd : FusionDefinition) -> None :
    """Build the failing FusionDefinition from the issue report.

    Declares four batch-1 CUDA input tensors and a chain of
    permute/cat/cos/sin/slice/broadcast/reshape ops; executing the
    resulting fusion trips an nvFuser INTERNAL ASSERT during segmented
    compilation (see the traceback in the issue body).

    NOTE(review): per the issue text this was extracted from HF Llama
    3.2 1B in training mode; the cos/sin + rotate-half pattern below is
    consistent with rotary position embeddings (RoPE), but that intent
    is inferred from context, not provable from this code alone.
    """
    # Inputs: one fp32 tensor (T0) and three bf16 tensors (T1-T3).
    T0 = fd.define_tensor(shape=[1, 32, 2048], contiguity=[None, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[2, 1, 0])
    T1 = fd.define_tensor(shape=[1, 2048, 2048], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    T2 = fd.define_tensor(shape=[1, 2048, 512], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    T3 = fd.define_tensor(shape=[1, 2048, 512], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    # cos/sin table: transpose T0 to [1, 2048, 32], duplicate along the
    # last dim via cat ([1, 2048, 64]), then take cos/sin, scale by 1.0,
    # and cast down to bf16 (T12/T13).
    T4 = fd.ops.permute(T0, dims=[0, 2, 1])
    T5 = fd.ops.cat([T4, T4], dim=-1, manual_padding=0)
    T6 = fd.ops.cos(T5)
    T7 = fd.ops.sin(T5)
    S8 = fd.define_scalar(1.00000, dtype=DataType.Double)
    T9 = fd.ops.mul(T6, S8)
    S10 = fd.define_scalar(1.00000, dtype=DataType.Double)
    T11 = fd.ops.mul(T7, S10)
    T12 = fd.ops.cast(T9, dtype=DataType.BFloat16)
    T13 = fd.ops.cast(T11, dtype=DataType.BFloat16)
    # Reshape/permute the bf16 inputs into [batch, heads, seq, head_dim]:
    # T20 has 32 heads; T27 and T34 have 8 heads.
    T19 = fd.ops.reshape(T1, new_shape=[1, 2048, 32, 64])
    T20 = fd.ops.permute(T19, dims=[0, 2, 1, 3])
    T26 = fd.ops.reshape(T2, new_shape=[1, 2048, 8, 64])
    T27 = fd.ops.permute(T26, dims=[0, 2, 1, 3])
    T33 = fd.ops.reshape(T3, new_shape=[1, 2048, 8, 64])
    T34 = fd.ops.permute(T33, dims=[0, 2, 1, 3])
    # Broadcast cos/sin to 4-D with a singleton head axis.
    T40 = fd.ops.broadcast_in_dim(T12, shape=[1, 1, 2048, 64], broadcast_dims=[0, 2, 3])
    T46 = fd.ops.broadcast_in_dim(T13, shape=[1, 1, 2048, 64], broadcast_dims=[0, 2, 3])
    # T20 * cos (in fp32).
    T52 = fd.ops.broadcast_in_dim(T40, shape=[1, 32, 2048, 64], broadcast_dims=[0, 1, 2, 3])
    T53 = fd.ops.cast(T20, dtype=DataType.Float)
    T54 = fd.ops.cast(T52, dtype=DataType.Float)
    T55 = fd.ops.mul(T53, T54)
    # Rotate-half of T20: split head_dim in half, negate the second half,
    # and concatenate as [-second, first].
    T71 = fd.ops.slice(T20, start_indices=[0, 0, 0, 0], end_indices=[1, 32, 2048, 32], strides=[1, 1, 1, 1], manual_normalization=0)
    T87 = fd.ops.slice(T20, start_indices=[0, 0, 0, 32], end_indices=[1, 32, 2048, 64], strides=[1, 1, 1, 1], manual_normalization=0)
    T88 = fd.ops.cast(T87, dtype=DataType.Float)
    T89 = fd.ops.neg(T88)
    T90 = fd.ops.cast(T89, dtype=DataType.BFloat16)
    T91 = fd.ops.cat([T90, T71], dim=-1, manual_padding=0)
    # T102 = T20*cos + rotate_half(T20)*sin, cast back to bf16.
    T97 = fd.ops.broadcast_in_dim(T46, shape=[1, 32, 2048, 64], broadcast_dims=[0, 1, 2, 3])
    T98 = fd.ops.cast(T91, dtype=DataType.Float)
    T99 = fd.ops.cast(T97, dtype=DataType.Float)
    T100 = fd.ops.mul(T98, T99)
    T101 = fd.ops.add(T55, T100)
    T102 = fd.ops.cast(T101, dtype=DataType.BFloat16)
    # Same cos/sin + rotate-half combination applied to the 8-head
    # tensor T27, producing T158.
    T108 = fd.ops.broadcast_in_dim(T40, shape=[1, 8, 2048, 64], broadcast_dims=[0, 1, 2, 3])
    T109 = fd.ops.cast(T27, dtype=DataType.Float)
    T110 = fd.ops.cast(T108, dtype=DataType.Float)
    T111 = fd.ops.mul(T109, T110)
    T127 = fd.ops.slice(T27, start_indices=[0, 0, 0, 0], end_indices=[1, 8, 2048, 32], strides=[1, 1, 1, 1], manual_normalization=0)
    T143 = fd.ops.slice(T27, start_indices=[0, 0, 0, 32], end_indices=[1, 8, 2048, 64], strides=[1, 1, 1, 1], manual_normalization=0)
    T144 = fd.ops.cast(T143, dtype=DataType.Float)
    T145 = fd.ops.neg(T144)
    T146 = fd.ops.cast(T145, dtype=DataType.BFloat16)
    T147 = fd.ops.cat([T146, T127], dim=-1, manual_padding=0)
    T153 = fd.ops.broadcast_in_dim(T46, shape=[1, 8, 2048, 64], broadcast_dims=[0, 1, 2, 3])
    T154 = fd.ops.cast(T147, dtype=DataType.Float)
    T155 = fd.ops.cast(T153, dtype=DataType.Float)
    T156 = fd.ops.mul(T154, T155)
    T157 = fd.ops.add(T111, T156)
    T158 = fd.ops.cast(T157, dtype=DataType.BFloat16)
    # Expand 8 heads to 32 via insert-axis broadcast + reshape
    # (repeat_interleave-style grouped-query-attention expansion) for
    # both T158 and T34.
    T165 = fd.ops.broadcast_in_dim(T158, shape=[1, 8, 1, 2048, 64], broadcast_dims=[0, 1, 3, 4])
    T172 = fd.ops.broadcast_in_dim(T165, shape=[1, 8, 4, 2048, 64], broadcast_dims=[0, 1, 2, 3, 4])
    T178 = fd.ops.reshape(T172, new_shape=[1, 32, 2048, 64])
    T185 = fd.ops.broadcast_in_dim(T34, shape=[1, 8, 1, 2048, 64], broadcast_dims=[0, 1, 3, 4])
    T192 = fd.ops.broadcast_in_dim(T185, shape=[1, 8, 4, 2048, 64], broadcast_dims=[0, 1, 2, 3, 4])
    T198 = fd.ops.reshape(T192, new_shape=[1, 32, 2048, 64])
    # Fusion outputs: a mix of intermediates (T54, T99, T110, T155) and
    # final tensors -- kept exactly as captured, since the output set
    # influences how nvFuser segments the fusion.
    fd.add_output(T34)
    fd.add_output(T54)
    fd.add_output(T99)
    fd.add_output(T102)
    fd.add_output(T110)
    fd.add_output(T155)
    fd.add_output(T158)
    fd.add_output(T178)
    fd.add_output(T198)

# Materialize the fusion, then execute it with concrete CUDA tensors.
with FusionDefinition() as fd:
    nvfuser_fusion_id38(fd)

# Inputs mirror the define_tensor() declarations above: one fp32 and
# three bf16 tensors on 'cuda:0' (a CUDA device is required).
inputs = [
    torch.testing.make_tensor((1, 32, 2048), dtype=torch.float32, device='cuda:0'),
    torch.testing.make_tensor((1, 2048, 2048), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 2048, 512), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 2048, 512), dtype=torch.bfloat16, device='cuda:0'),
]
# This call triggers the reported INTERNAL ASSERT while compiling the
# fusion segments (see traceback below).
fd.execute(inputs)
Traceback (most recent call last):
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/__init__.py", line 181, in execute
    results = self._execute(
RuntimeError:  INTERNAL ASSERT FAILED at "/workspace/Fuser/csrc/runtime/fusion_kernel_runtime.cpp":358, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Detected exception while compiling fusion segments in parallel. Error messages from all threads are printed below.

Error from segmentation group 8:  INTERNAL ASSERT FAILED at "/workspace/Fuser/csrc/transform_iter.cpp":546, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Error during replay, a transformation was called that conflicts with an rfactor call.
Exception raised from BestEffortReplay at /workspace/Fuser/csrc/transform_iter.cpp:546 (most recent call first):
frame #0: nvfuser::nvfCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x9a (0x7f386d38030e in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #1: nvfuser::nvfErrorFail(char const*, char const*, unsigned int, char const*, std::string const&) + 0x3e (0x7f386d73096e in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #2: <unknown function> + 0x9105cb (0x7f386db515cb in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #3: nvfuser::BestEffortReplay::replayCasP(nvfuser::TensorView const*, nvfuser::TensorView const*, long, nvfuser::LogicalDomainMap const&, bool, bool, bool) + 0x672 (0x7f386db549e2 in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #4: nvfuser::TransformReplay::replayCasP(nvfuser::TensorView const*, nvfuser::TensorView const*, long, nvfuser::LogicalDomainMap const&, nvfuser::TransformReplayOptions) + 0x1f9 (0x7f386db5ed79 in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #5: nvfuser::TransformReplay::replayCasP(nvfuser::TensorView const*, nvfuser::TensorView const*, long, nvfuser::TransformReplayOptions) + 0x3f (0x7f386db6063f in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #6: nvfuser::TransformPropagator::propagateP2C(nvfuser::TensorView*, nvfuser::TensorView*) + 0x133 (0x7f386db60783 in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #7: nvfuser::MaxInfoSpanningTree::traverse(nvfuser::MaxInfoSpanningTree::Propagator*) + 0xbe (0x7f386daf02ce in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #8: <unknown function> + 0x87c32b (0x7f386dabd32b in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #9: <unknown function> + 0x87d4fb (0x7f386dabe4fb in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #10: <unknown function> + 0x7f8174 (0x7f386da39174 in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #11: <unknown function> + 0x7f834b (0x7f386da3934b in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #12: c10::ThreadPool::main_loop(unsigned long) + 0x2b3 (0x7f3a80528dd3 in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #13: <unknown function> + 0x7bf27 (0x7f3a80528f27 in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #14: <unknown function> + 0x145c0 (0x7f3a9018e5c0 in /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #15: <unknown function> + 0x8609 (0x7f3a959ab609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #16: clone + 0x43 (0x7f3a95776353 in /lib/x86_64-linux-gnu/libc.so.6)

Python repro I'm using:

# End-to-end repro: run HF Llama 3.2 1B under Thunder with the nvfuser
# executor followed by torchcompile (the nonstandard combination noted
# in the issue).  Requires a CUDA device plus the third-party
# 'transformers' and 'thunder' packages.
import torch
from transformers.models.llama import LlamaForCausalLM, LlamaConfig

config = LlamaConfig.from_pretrained('meta-llama/Llama-3.2-1B')
#config.num_hidden_layers = 1  # optional: shrink the model for a faster repro

# Instantiate directly on the GPU in bf16.
with torch.device("cuda"):
    model = LlamaForCausalLM(config).to(torch.bfloat16)

# Training-mode inputs: passing labels makes the model compute a loss.
args = dict(
    #cache_positions=torch.arange(6, device="cuda"),
    input_ids=torch.ones(1, 2048, dtype=torch.int64, device="cuda"),
    labels=torch.ones(1, 2048, dtype=torch.int64, device="cuda"),
    #attention_mask=torch.ones(1, 6, dtype=torch.int64, device="cuda"),
    #inputs_embeds=None,
    #use_cache=True,
    return_dict=True,
)

import thunder
from thunder.transforms.cudagraph import CUDAGraphTransform
# Executor priority order: nvfuser before torchcompile.
jm = thunder.jit(model,
                executors=('apex', 'cudnn', 'sdpa', 'nvfuser', 'torchcompile'),
                ) #, transforms=(CUDAGraphTransform(),))

model(**args);  # eager reference/warm-up call
res = jm(**args);  # jitted call -- builds the failing fusion
res.loss.backward()  # backward pass exercises the generated fusions

of course, it should also work with specifying the config directly instead of using from_pretrained, e.g.

# Alternative to from_pretrained(): the Llama 3.2 1B config spelled out
# inline, so the repro needs no network access or HF credentials.
LLAMA_3_2_1B_CFG = {
    "architectures": ["LlamaForCausalLM"],
    "attention_bias": False,
    "attention_dropout": 0.0,
    "bos_token_id": 128000,
    "eos_token_id": 128001,
    "head_dim": 64,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "initializer_range": 0.02,
    "intermediate_size": 8192,
    "max_position_embeddings": 131072,
    "mlp_bias": False,
    "model_type": "llama",
    # 32 query heads vs 8 KV heads -- matches the 32- vs 8-head tensors
    # seen in the failing fusion definition above.
    "num_attention_heads": 32,
    "num_hidden_layers": 16,
    "num_key_value_heads": 8,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": {
        "factor": 32.0,
        "high_freq_factor": 4.0,
        "low_freq_factor": 1.0,
        "original_max_position_embeddings": 8192,
        "rope_type": "llama3",
    },
    "rope_theta": 500000.0,
    "tie_word_embeddings": True,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.45.0.dev0",
    "use_cache": True,
    "vocab_size": 128256,
    "_commit_hash": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
}


config = LlamaConfig(**LLAMA_3_2_1B_CFG)

# Same bf16-on-GPU instantiation as in the main repro script.
with torch.device("cuda"):
    model = LlamaForCausalLM(config).to(torch.bfloat16)
@kevinstephano
Copy link
Collaborator

I think this is a duplicate of #3505.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

2 participants