From eba7955a198ee4c8f1e9e00e505a9b5993c0c14c Mon Sep 17 00:00:00 2001 From: xffxff <1247714429@qq.com> Date: Thu, 21 Nov 2024 01:41:21 +0000 Subject: [PATCH 1/2] refactor(vllm): one file for vllm --- aria/vllm/aria.py | 432 +++++++++++++++++++++++++++++++++++- aria/vllm/projector.py | 170 -------------- aria/vllm/vision_encoder.py | 94 -------- 3 files changed, 429 insertions(+), 267 deletions(-) delete mode 100644 aria/vllm/projector.py delete mode 100644 aria/vllm/vision_encoder.py diff --git a/aria/vllm/aria.py b/aria/vllm/aria.py index f6abe09..2013fa5 100644 --- a/aria/vllm/aria.py +++ b/aria/vllm/aria.py @@ -65,11 +65,437 @@ from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .projector import AriaProjector -from .vision_encoder import AriaVisionModel - logger = logging.get_logger(__name__) +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from transformers.activations import ACT2FN +from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig +from vllm.config import QuantizationConfig +from vllm.model_executor.models.idefics2_vision_model import Idefics2VisionTransformer + + +class AriaVisionConfig(Idefics2VisionConfig): + model_type = "aria_vision_model" + + +class IdentityOp(torch.nn.Module): + """ + An identity operation that returns the input unchanged. + + This can be used as a placeholder or to maintain architectural consistency + when a specific operation is not needed. + """ + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, x, *args, **kwargs): + return x + + +class AriaVisionTransformer(Idefics2VisionTransformer): + def __init__( + self, + config: AriaVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, quant_config, prefix) + self.post_layernorm = IdentityOp() + + +class AriaVisionModel(nn.Module): + config_class = AriaVisionConfig + + def __init__( + self, + config: AriaVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: + super().__init__() + + self.vision_model = AriaVisionTransformer( + config, + quant_config, + prefix=f"{prefix}.vision_model", + ) + + def forward( + self, + pixel_values: torch.Tensor, + pixel_mask: Optional[torch.BoolTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + + vit_oup = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + image_atts = self._create_image_attention_mask(patch_attention_mask) + + return vit_oup, image_atts + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_model.config.patch_size, + step=self.vision_model.config.patch_size, + ).unfold( + dimension=2, + size=self.vision_model.config.patch_size, + step=self.vision_model.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def _create_image_attention_mask(self, patch_attention_mask): + if patch_attention_mask is None: + return None + + flattened_mask = patch_attention_mask.flatten(1) + return torch.logical_not(flattened_mask) + + +class FFN(nn.Module): + """ + Feed-Forward Network module. + + Args: + embed_dim (int): Input embedding dimension. + ff_dim (int): Hidden dimension of the feed-forward network. 
+ output_dim (int): Output dimension. + """ + + def __init__(self, embed_dim, ff_dim, output_dim): + super().__init__() + self.linear_in = nn.Linear(embed_dim, ff_dim, bias=False) + self.linear_out = nn.Linear(ff_dim, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class CrossAttention(nn.Module): + """ + Cross-Attention module. + + Args: + kv_dim (int): Dimension of key and value. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + drop_out_rate (float): Dropout rate. Default is 0. + """ + + def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): + super().__init__() + self.num_heads = num_heads + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) + self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) + self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) + + self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + self.linear = nn.Linear(embed_dim, embed_dim) + self.dropout = nn.Dropout(drop_out_rate) + + self.layer_norm = nn.LayerNorm(embed_dim) + self.ln_kv = nn.LayerNorm(kv_dim) + + def forward(self, x, hidden_states, attn_mask=None, add_residual=False): + """ + Forward pass of the CrossAttention module. + + Args: + x (torch.Tensor): Input tensor for key and value. + hidden_states (torch.Tensor): Input tensor for query. + attn_mask (torch.Tensor, optional): Attention mask. Default is None. + add_residual (bool): Whether to add residual connection. Default is False. + + Returns: + torch.Tensor: Output tensor after cross-attention. + """ + normed_hidden_states = self.layer_norm(hidden_states) + query = self.q_proj(normed_hidden_states).permute(1, 0, 2) + + x = self.ln_kv(x) + key = self.k_proj(x).permute(1, 0, 2) + value = self.v_proj(x).permute(1, 0, 2) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = attn_output.permute(1, 0, 2) + + if add_residual: + attn_output = hidden_states + self.dropout(self.linear(attn_output)) + else: + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + A projection module with one cross attention layer and one FFN layer, which projects ViT's outputs into MoE's inputs. + + Args: + patch_to_query_dict (dict): Maps patch numbers to their corresponding query numbers, + e.g., {1225: 128, 4900: 256}. This allows for different query sizes based on image resolution. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + kv_dim (int): Dimension of key and value. + ff_dim (int): Hidden dimension of the feed-forward network. + output_dim (int): Output dimension. + norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. 
+ + Outputs: + A tensor with the shape of (batch_size, query_number, output_dim) + """ + + def __init__( + self, + patch_to_query_dict, + embed_dim, + num_heads, + kv_dim, + ff_dim, + output_dim, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.patch_to_query_dict = patch_to_query_dict + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.query = nn.Parameter( + torch.zeros(max(patch_to_query_dict.values()), self.embed_dim) + ) + + trunc_normal_(self.query, std=0.02) + + self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) + + self.ln_ffn = norm_layer(embed_dim) + self.ffn = FFN(embed_dim, ff_dim, output_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, attn_mask=None): + """ + Forward pass of the Projector module. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (torch.Tensor, optional): Attention mask. Default is None. + + Returns: + torch.Tensor: Output tensor of shape (batch_size, query_number, output_dim). + """ + bs = x.shape[0] + queries = self.query.unsqueeze(0).repeat(bs, 1, 1) + + query_num = self.patch_to_query_dict.get(x.shape[1], None) + assert ( + query_num is not None + ), f"Query number for {x.shape[1]} patches is not provided" + + queries = queries[:, :query_num, :] + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) + + out = self.ffn(self.ln_ffn(attention_out)) + + return out + + +class FFN(nn.Module): + """ + Feed-Forward Network module. + + Args: + embed_dim (int): Input embedding dimension. + ff_dim (int): Hidden dimension of the feed-forward network. + output_dim (int): Output dimension. + """ + + def __init__(self, embed_dim, ff_dim, output_dim): + super().__init__() + self.linear_in = nn.Linear(embed_dim, ff_dim, bias=False) + self.linear_out = nn.Linear(ff_dim, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class CrossAttention(nn.Module): + """ + Cross-Attention module. + + Args: + kv_dim (int): Dimension of key and value. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + drop_out_rate (float): Dropout rate. Default is 0. + """ + + def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): + super().__init__() + self.num_heads = num_heads + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) + self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) + self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) + + self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + self.linear = nn.Linear(embed_dim, embed_dim) + self.dropout = nn.Dropout(drop_out_rate) + + self.layer_norm = nn.LayerNorm(embed_dim) + self.ln_kv = nn.LayerNorm(kv_dim) + + def forward(self, x, hidden_states, attn_mask=None, add_residual=False): + """ + Forward pass of the CrossAttention module. + + Args: + x (torch.Tensor): Input tensor for key and value. + hidden_states (torch.Tensor): Input tensor for query. 
+ attn_mask (torch.Tensor, optional): Attention mask. Default is None. + add_residual (bool): Whether to add residual connection. Default is False. + + Returns: + torch.Tensor: Output tensor after cross-attention. + """ + normed_hidden_states = self.layer_norm(hidden_states) + query = self.q_proj(normed_hidden_states).permute(1, 0, 2) + + x = self.ln_kv(x) + key = self.k_proj(x).permute(1, 0, 2) + value = self.v_proj(x).permute(1, 0, 2) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = attn_output.permute(1, 0, 2) + + if add_residual: + attn_output = hidden_states + self.dropout(self.linear(attn_output)) + else: + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + A projection module with one cross attention layer and one FFN layer, which projects ViT's outputs into MoE's inputs. + + Args: + patch_to_query_dict (dict): Maps patch numbers to their corresponding query numbers, + e.g., {1225: 128, 4900: 256}. This allows for different query sizes based on image resolution. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + kv_dim (int): Dimension of key and value. + ff_dim (int): Hidden dimension of the feed-forward network. + output_dim (int): Output dimension. + norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. + + Outputs: + A tensor with the shape of (batch_size, query_number, output_dim) + """ + + def __init__( + self, + patch_to_query_dict, + embed_dim, + num_heads, + kv_dim, + ff_dim, + output_dim, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.patch_to_query_dict = patch_to_query_dict + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.query = nn.Parameter( + torch.zeros(max(patch_to_query_dict.values()), self.embed_dim) + ) + + trunc_normal_(self.query, std=0.02) + + self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) + + self.ln_ffn = norm_layer(embed_dim) + self.ffn = FFN(embed_dim, ff_dim, output_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, attn_mask=None): + """ + Forward pass of the Projector module. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (torch.Tensor, optional): Attention mask. Default is None. + + Returns: + torch.Tensor: Output tensor of shape (batch_size, query_number, output_dim). 
+ """ + bs = x.shape[0] + queries = self.query.unsqueeze(0).repeat(bs, 1, 1) + + query_num = self.patch_to_query_dict.get(x.shape[1], None) + assert ( + query_num is not None + ), f"Query number for {x.shape[1]} patches is not provided" + + queries = queries[:, :query_num, :] + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) + + out = self.ffn(self.ln_ffn(attention_out)) + + return out + class AriaMoELMConfig(LlamaConfig): """ diff --git a/aria/vllm/projector.py b/aria/vllm/projector.py deleted file mode 100644 index 3b14525..0000000 --- a/aria/vllm/projector.py +++ /dev/null @@ -1,170 +0,0 @@ -import torch -import torch.nn as nn -from torch.nn.init import trunc_normal_ -from transformers.activations import ACT2FN - - -class FFN(nn.Module): - """ - Feed-Forward Network module. - - Args: - embed_dim (int): Input embedding dimension. - ff_dim (int): Hidden dimension of the feed-forward network. - output_dim (int): Output dimension. - """ - - def __init__(self, embed_dim, ff_dim, output_dim): - super().__init__() - self.linear_in = nn.Linear(embed_dim, ff_dim, bias=False) - self.linear_out = nn.Linear(ff_dim, output_dim, bias=False) - self.act = ACT2FN["gelu_new"] - - def forward(self, hidden_states): - hidden_states = self.act(self.linear_in(hidden_states)) - hidden_states = self.linear_out(hidden_states) - return hidden_states - - -class CrossAttention(nn.Module): - """ - Cross-Attention module. - - Args: - kv_dim (int): Dimension of key and value. - embed_dim (int): Embedding dimension. - num_heads (int): Number of attention heads. - drop_out_rate (float): Dropout rate. Default is 0. - """ - - def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): - super().__init__() - self.num_heads = num_heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) - self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) - self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) - - self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) - self.linear = nn.Linear(embed_dim, embed_dim) - self.dropout = nn.Dropout(drop_out_rate) - - self.layer_norm = nn.LayerNorm(embed_dim) - self.ln_kv = nn.LayerNorm(kv_dim) - - def forward(self, x, hidden_states, attn_mask=None, add_residual=False): - """ - Forward pass of the CrossAttention module. - - Args: - x (torch.Tensor): Input tensor for key and value. - hidden_states (torch.Tensor): Input tensor for query. - attn_mask (torch.Tensor, optional): Attention mask. Default is None. - add_residual (bool): Whether to add residual connection. Default is False. - - Returns: - torch.Tensor: Output tensor after cross-attention. - """ - normed_hidden_states = self.layer_norm(hidden_states) - query = self.q_proj(normed_hidden_states).permute(1, 0, 2) - - x = self.ln_kv(x) - key = self.k_proj(x).permute(1, 0, 2) - value = self.v_proj(x).permute(1, 0, 2) - - attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) - - attn_output = attn_output.permute(1, 0, 2) - - if add_residual: - attn_output = hidden_states + self.dropout(self.linear(attn_output)) - else: - attn_output = self.dropout(self.linear(attn_output)) - - return attn_output - - -class AriaProjector(nn.Module): - """ - A projection module with one cross attention layer and one FFN layer, which projects ViT's outputs into MoE's inputs. 
- - Args: - patch_to_query_dict (dict): Maps patch numbers to their corresponding query numbers, - e.g., {1225: 128, 4900: 256}. This allows for different query sizes based on image resolution. - embed_dim (int): Embedding dimension. - num_heads (int): Number of attention heads. - kv_dim (int): Dimension of key and value. - ff_dim (int): Hidden dimension of the feed-forward network. - output_dim (int): Output dimension. - norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. - - Outputs: - A tensor with the shape of (batch_size, query_number, output_dim) - """ - - def __init__( - self, - patch_to_query_dict, - embed_dim, - num_heads, - kv_dim, - ff_dim, - output_dim, - norm_layer=nn.LayerNorm, - ): - super().__init__() - self.patch_to_query_dict = patch_to_query_dict - self.embed_dim = embed_dim - self.num_heads = num_heads - - self.query = nn.Parameter( - torch.zeros(max(patch_to_query_dict.values()), self.embed_dim) - ) - - trunc_normal_(self.query, std=0.02) - - self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) - - self.ln_ffn = norm_layer(embed_dim) - self.ffn = FFN(embed_dim, ff_dim, output_dim) - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def forward(self, x, attn_mask=None): - """ - Forward pass of the Projector module. - - Args: - x (torch.Tensor): Input tensor of shape (batch_size, num_patches, kv_dim). - attn_mask (torch.Tensor, optional): Attention mask. Default is None. - - Returns: - torch.Tensor: Output tensor of shape (batch_size, query_number, output_dim). - """ - bs = x.shape[0] - queries = self.query.unsqueeze(0).repeat(bs, 1, 1) - - query_num = self.patch_to_query_dict.get(x.shape[1], None) - assert ( - query_num is not None - ), f"Query number for {x.shape[1]} patches is not provided" - - queries = queries[:, :query_num, :] - - if attn_mask is not None: - attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) - attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) - - attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) - - out = self.ffn(self.ln_ffn(attention_out)) - - return out diff --git a/aria/vllm/vision_encoder.py b/aria/vllm/vision_encoder.py deleted file mode 100644 index 9423027..0000000 --- a/aria/vllm/vision_encoder.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import Optional, Tuple - -import torch -import torch.nn as nn -from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig -from vllm.config import QuantizationConfig -from vllm.model_executor.models.idefics2_vision_model import Idefics2VisionTransformer - - -class AriaVisionConfig(Idefics2VisionConfig): - model_type = "aria_vision_model" - - -class IdentityOp(torch.nn.Module): - """ - An identity operation that returns the input unchanged. - - This can be used as a placeholder or to maintain architectural consistency - when a specific operation is not needed. 
- """ - - def __init__(self, *args, **kwargs): - super().__init__() - - def forward(self, x, *args, **kwargs): - return x - - -class AriaVisionTransformer(Idefics2VisionTransformer): - def __init__( - self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__(config, quant_config, prefix) - self.post_layernorm = IdentityOp() - - -class AriaVisionModel(nn.Module): - config_class = AriaVisionConfig - - def __init__( - self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - *, - prefix: str = "", - ) -> None: - super().__init__() - - self.vision_model = AriaVisionTransformer( - config, - quant_config, - prefix=f"{prefix}.vision_model", - ) - - def forward( - self, - pixel_values: torch.Tensor, - pixel_mask: Optional[torch.BoolTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: - patch_attention_mask = self._create_patch_attention_mask(pixel_mask) - - vit_oup = self.vision_model( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - ) - - image_atts = self._create_image_attention_mask(patch_attention_mask) - - return vit_oup, image_atts - - def _create_patch_attention_mask(self, pixel_mask): - if pixel_mask is None: - return None - - patches_subgrid = pixel_mask.unfold( - dimension=1, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ).unfold( - dimension=2, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ) - return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - - def _create_image_attention_mask(self, patch_attention_mask): - if patch_attention_mask is None: - return None - - flattened_mask = patch_attention_mask.flatten(1) - return torch.logical_not(flattened_mask) From e40ac1a82b3594e361e5461043918f4815ff6f68 Mon Sep 17 00:00:00 2001 From: xffxff <1247714429@qq.com> Date: Thu, 21 Nov 2024 03:07:54 +0000 Subject: [PATCH 2/2] remove some comments --- aria/vllm/aria.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/aria/vllm/aria.py b/aria/vllm/aria.py index 2013fa5..56407f4 100644 --- a/aria/vllm/aria.py +++ b/aria/vllm/aria.py @@ -637,9 +637,6 @@ class MoELayer(nn.Module): This layer implements the MoE mechanism, which routes input tokens to different experts based on a routing algorithm, processes them through the experts, and then combines the outputs. - - Args: - config (AriaMoELMConfig): Configuration object for the MoE layer. """ def __init__( @@ -680,10 +677,6 @@ class MoEDecoderLayer(LlamaDecoderLayer): """ Custom Decoder Layer for the AriaMoE model which modifies the standard `LlamaDecoderLayer` by replacing the traditional MLP with a Mixture of Experts (MoE) Layer. - - Args: - config (LlamaConfig): Configuration object for the layer. - layer_idx (int): Index of the current layer in the model. """ def __init__( @@ -736,12 +729,6 @@ class AriaMoELMModel(LlamaModel): """ Custom LlamaModel for the AriaMoE model which modifies the standard LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`. - - This model implements a Mixture of Experts (MoE) approach, where each layer contains - multiple expert networks that specialize in different aspects of the input. - - Args: - config (LlamaConfig): Configuration object for the model. 
""" def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -751,7 +738,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - # FIXME(zhoufan): this is a hack to avoid the error: AttributeError: 'AriaMoELMModel' object has no attribute 'do_not_compile'. + # FIXME: this is a hack to disable the compilation of the model self.do_not_compile = True self.layers = None