From eba7955a198ee4c8f1e9e00e505a9b5993c0c14c Mon Sep 17 00:00:00 2001 From: xffxff <1247714429@qq.com> Date: Thu, 21 Nov 2024 01:41:21 +0000 Subject: [PATCH 1/2] refactor(vllm): one file for vllm --- aria/vllm/aria.py | 432 +++++++++++++++++++++++++++++++++++- aria/vllm/projector.py | 170 -------------- aria/vllm/vision_encoder.py | 94 -------- 3 files changed, 429 insertions(+), 267 deletions(-) delete mode 100644 aria/vllm/projector.py delete mode 100644 aria/vllm/vision_encoder.py diff --git a/aria/vllm/aria.py b/aria/vllm/aria.py index f6abe09..2013fa5 100644 --- a/aria/vllm/aria.py +++ b/aria/vllm/aria.py @@ -65,11 +65,437 @@ from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .projector import AriaProjector -from .vision_encoder import AriaVisionModel - logger = logging.get_logger(__name__) +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from transformers.activations import ACT2FN +from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig +from vllm.config import QuantizationConfig +from vllm.model_executor.models.idefics2_vision_model import Idefics2VisionTransformer + + +class AriaVisionConfig(Idefics2VisionConfig): + model_type = "aria_vision_model" + + +class IdentityOp(torch.nn.Module): + """ + An identity operation that returns the input unchanged. + + This can be used as a placeholder or to maintain architectural consistency + when a specific operation is not needed. + """ + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, x, *args, **kwargs): + return x + + +class AriaVisionTransformer(Idefics2VisionTransformer): + def __init__( + self, + config: AriaVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, quant_config, prefix) + self.post_layernorm = IdentityOp() + + +class AriaVisionModel(nn.Module): + config_class = AriaVisionConfig + + def __init__( + self, + config: AriaVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: + super().__init__() + + self.vision_model = AriaVisionTransformer( + config, + quant_config, + prefix=f"{prefix}.vision_model", + ) + + def forward( + self, + pixel_values: torch.Tensor, + pixel_mask: Optional[torch.BoolTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + + vit_oup = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + image_atts = self._create_image_attention_mask(patch_attention_mask) + + return vit_oup, image_atts + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_model.config.patch_size, + step=self.vision_model.config.patch_size, + ).unfold( + dimension=2, + size=self.vision_model.config.patch_size, + step=self.vision_model.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def _create_image_attention_mask(self, patch_attention_mask): + if patch_attention_mask is None: + return None + + flattened_mask = patch_attention_mask.flatten(1) + return torch.logical_not(flattened_mask) + + +class FFN(nn.Module): + """ + Feed-Forward Network module. + + Args: + embed_dim (int): Input embedding dimension. + ff_dim (int): Hidden dimension of the feed-forward network. 
+ output_dim (int): Output dimension. + """ + + def __init__(self, embed_dim, ff_dim, output_dim): + super().__init__() + self.linear_in = nn.Linear(embed_dim, ff_dim, bias=False) + self.linear_out = nn.Linear(ff_dim, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class CrossAttention(nn.Module): + """ + Cross-Attention module. + + Args: + kv_dim (int): Dimension of key and value. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + drop_out_rate (float): Dropout rate. Default is 0. + """ + + def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): + super().__init__() + self.num_heads = num_heads + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) + self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) + self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) + + self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + self.linear = nn.Linear(embed_dim, embed_dim) + self.dropout = nn.Dropout(drop_out_rate) + + self.layer_norm = nn.LayerNorm(embed_dim) + self.ln_kv = nn.LayerNorm(kv_dim) + + def forward(self, x, hidden_states, attn_mask=None, add_residual=False): + """ + Forward pass of the CrossAttention module. + + Args: + x (torch.Tensor): Input tensor for key and value. + hidden_states (torch.Tensor): Input tensor for query. + attn_mask (torch.Tensor, optional): Attention mask. Default is None. + add_residual (bool): Whether to add residual connection. Default is False. + + Returns: + torch.Tensor: Output tensor after cross-attention. + """ + normed_hidden_states = self.layer_norm(hidden_states) + query = self.q_proj(normed_hidden_states).permute(1, 0, 2) + + x = self.ln_kv(x) + key = self.k_proj(x).permute(1, 0, 2) + value = self.v_proj(x).permute(1, 0, 2) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = attn_output.permute(1, 0, 2) + + if add_residual: + attn_output = hidden_states + self.dropout(self.linear(attn_output)) + else: + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + A projection module with one cross attention layer and one FFN layer, which projects ViT's outputs into MoE's inputs. + + Args: + patch_to_query_dict (dict): Maps patch numbers to their corresponding query numbers, + e.g., {1225: 128, 4900: 256}. This allows for different query sizes based on image resolution. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + kv_dim (int): Dimension of key and value. + ff_dim (int): Hidden dimension of the feed-forward network. + output_dim (int): Output dimension. + norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. 
+ + Outputs: + A tensor with the shape of (batch_size, query_number, output_dim) + """ + + def __init__( + self, + patch_to_query_dict, + embed_dim, + num_heads, + kv_dim, + ff_dim, + output_dim, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.patch_to_query_dict = patch_to_query_dict + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.query = nn.Parameter( + torch.zeros(max(patch_to_query_dict.values()), self.embed_dim) + ) + + trunc_normal_(self.query, std=0.02) + + self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) + + self.ln_ffn = norm_layer(embed_dim) + self.ffn = FFN(embed_dim, ff_dim, output_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, attn_mask=None): + """ + Forward pass of the Projector module. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (torch.Tensor, optional): Attention mask. Default is None. + + Returns: + torch.Tensor: Output tensor of shape (batch_size, query_number, output_dim). + """ + bs = x.shape[0] + queries = self.query.unsqueeze(0).repeat(bs, 1, 1) + + query_num = self.patch_to_query_dict.get(x.shape[1], None) + assert ( + query_num is not None + ), f"Query number for {x.shape[1]} patches is not provided" + + queries = queries[:, :query_num, :] + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) + + out = self.ffn(self.ln_ffn(attention_out)) + + return out + + +class FFN(nn.Module): + """ + Feed-Forward Network module. + + Args: + embed_dim (int): Input embedding dimension. + ff_dim (int): Hidden dimension of the feed-forward network. + output_dim (int): Output dimension. + """ + + def __init__(self, embed_dim, ff_dim, output_dim): + super().__init__() + self.linear_in = nn.Linear(embed_dim, ff_dim, bias=False) + self.linear_out = nn.Linear(ff_dim, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class CrossAttention(nn.Module): + """ + Cross-Attention module. + + Args: + kv_dim (int): Dimension of key and value. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + drop_out_rate (float): Dropout rate. Default is 0. + """ + + def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): + super().__init__() + self.num_heads = num_heads + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) + self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) + self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) + + self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + self.linear = nn.Linear(embed_dim, embed_dim) + self.dropout = nn.Dropout(drop_out_rate) + + self.layer_norm = nn.LayerNorm(embed_dim) + self.ln_kv = nn.LayerNorm(kv_dim) + + def forward(self, x, hidden_states, attn_mask=None, add_residual=False): + """ + Forward pass of the CrossAttention module. + + Args: + x (torch.Tensor): Input tensor for key and value. + hidden_states (torch.Tensor): Input tensor for query. 
+ attn_mask (torch.Tensor, optional): Attention mask. Default is None. + add_residual (bool): Whether to add residual connection. Default is False. + + Returns: + torch.Tensor: Output tensor after cross-attention. + """ + normed_hidden_states = self.layer_norm(hidden_states) + query = self.q_proj(normed_hidden_states).permute(1, 0, 2) + + x = self.ln_kv(x) + key = self.k_proj(x).permute(1, 0, 2) + value = self.v_proj(x).permute(1, 0, 2) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = attn_output.permute(1, 0, 2) + + if add_residual: + attn_output = hidden_states + self.dropout(self.linear(attn_output)) + else: + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + A projection module with one cross attention layer and one FFN layer, which projects ViT's outputs into MoE's inputs. + + Args: + patch_to_query_dict (dict): Maps patch numbers to their corresponding query numbers, + e.g., {1225: 128, 4900: 256}. This allows for different query sizes based on image resolution. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + kv_dim (int): Dimension of key and value. + ff_dim (int): Hidden dimension of the feed-forward network. + output_dim (int): Output dimension. + norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. + + Outputs: + A tensor with the shape of (batch_size, query_number, output_dim) + """ + + def __init__( + self, + patch_to_query_dict, + embed_dim, + num_heads, + kv_dim, + ff_dim, + output_dim, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.patch_to_query_dict = patch_to_query_dict + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.query = nn.Parameter( + torch.zeros(max(patch_to_query_dict.values()), self.embed_dim) + ) + + trunc_normal_(self.query, std=0.02) + + self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) + + self.ln_ffn = norm_layer(embed_dim) + self.ffn = FFN(embed_dim, ff_dim, output_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, attn_mask=None): + """ + Forward pass of the Projector module. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (torch.Tensor, optional): Attention mask. Default is None. + + Returns: + torch.Tensor: Output tensor of shape (batch_size, query_number, output_dim). 
+ """ + bs = x.shape[0] + queries = self.query.unsqueeze(0).repeat(bs, 1, 1) + + query_num = self.patch_to_query_dict.get(x.shape[1], None) + assert ( + query_num is not None + ), f"Query number for {x.shape[1]} patches is not provided" + + queries = queries[:, :query_num, :] + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) + + out = self.ffn(self.ln_ffn(attention_out)) + + return out + class AriaMoELMConfig(LlamaConfig): """ diff --git a/aria/vllm/projector.py b/aria/vllm/projector.py deleted file mode 100644 index 3b14525..0000000 --- a/aria/vllm/projector.py +++ /dev/null @@ -1,170 +0,0 @@ -import torch -import torch.nn as nn -from torch.nn.init import trunc_normal_ -from transformers.activations import ACT2FN - - -class FFN(nn.Module): - """ - Feed-Forward Network module. - - Args: - embed_dim (int): Input embedding dimension. - ff_dim (int): Hidden dimension of the feed-forward network. - output_dim (int): Output dimension. - """ - - def __init__(self, embed_dim, ff_dim, output_dim): - super().__init__() - self.linear_in = nn.Linear(embed_dim, ff_dim, bias=False) - self.linear_out = nn.Linear(ff_dim, output_dim, bias=False) - self.act = ACT2FN["gelu_new"] - - def forward(self, hidden_states): - hidden_states = self.act(self.linear_in(hidden_states)) - hidden_states = self.linear_out(hidden_states) - return hidden_states - - -class CrossAttention(nn.Module): - """ - Cross-Attention module. - - Args: - kv_dim (int): Dimension of key and value. - embed_dim (int): Embedding dimension. - num_heads (int): Number of attention heads. - drop_out_rate (float): Dropout rate. Default is 0. - """ - - def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): - super().__init__() - self.num_heads = num_heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) - self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) - self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) - - self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) - self.linear = nn.Linear(embed_dim, embed_dim) - self.dropout = nn.Dropout(drop_out_rate) - - self.layer_norm = nn.LayerNorm(embed_dim) - self.ln_kv = nn.LayerNorm(kv_dim) - - def forward(self, x, hidden_states, attn_mask=None, add_residual=False): - """ - Forward pass of the CrossAttention module. - - Args: - x (torch.Tensor): Input tensor for key and value. - hidden_states (torch.Tensor): Input tensor for query. - attn_mask (torch.Tensor, optional): Attention mask. Default is None. - add_residual (bool): Whether to add residual connection. Default is False. - - Returns: - torch.Tensor: Output tensor after cross-attention. - """ - normed_hidden_states = self.layer_norm(hidden_states) - query = self.q_proj(normed_hidden_states).permute(1, 0, 2) - - x = self.ln_kv(x) - key = self.k_proj(x).permute(1, 0, 2) - value = self.v_proj(x).permute(1, 0, 2) - - attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) - - attn_output = attn_output.permute(1, 0, 2) - - if add_residual: - attn_output = hidden_states + self.dropout(self.linear(attn_output)) - else: - attn_output = self.dropout(self.linear(attn_output)) - - return attn_output - - -class AriaProjector(nn.Module): - """ - A projection module with one cross attention layer and one FFN layer, which projects ViT's outputs into MoE's inputs. 
- - Args: - patch_to_query_dict (dict): Maps patch numbers to their corresponding query numbers, - e.g., {1225: 128, 4900: 256}. This allows for different query sizes based on image resolution. - embed_dim (int): Embedding dimension. - num_heads (int): Number of attention heads. - kv_dim (int): Dimension of key and value. - ff_dim (int): Hidden dimension of the feed-forward network. - output_dim (int): Output dimension. - norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. - - Outputs: - A tensor with the shape of (batch_size, query_number, output_dim) - """ - - def __init__( - self, - patch_to_query_dict, - embed_dim, - num_heads, - kv_dim, - ff_dim, - output_dim, - norm_layer=nn.LayerNorm, - ): - super().__init__() - self.patch_to_query_dict = patch_to_query_dict - self.embed_dim = embed_dim - self.num_heads = num_heads - - self.query = nn.Parameter( - torch.zeros(max(patch_to_query_dict.values()), self.embed_dim) - ) - - trunc_normal_(self.query, std=0.02) - - self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) - - self.ln_ffn = norm_layer(embed_dim) - self.ffn = FFN(embed_dim, ff_dim, output_dim) - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def forward(self, x, attn_mask=None): - """ - Forward pass of the Projector module. - - Args: - x (torch.Tensor): Input tensor of shape (batch_size, num_patches, kv_dim). - attn_mask (torch.Tensor, optional): Attention mask. Default is None. - - Returns: - torch.Tensor: Output tensor of shape (batch_size, query_number, output_dim). - """ - bs = x.shape[0] - queries = self.query.unsqueeze(0).repeat(bs, 1, 1) - - query_num = self.patch_to_query_dict.get(x.shape[1], None) - assert ( - query_num is not None - ), f"Query number for {x.shape[1]} patches is not provided" - - queries = queries[:, :query_num, :] - - if attn_mask is not None: - attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) - attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) - - attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) - - out = self.ffn(self.ln_ffn(attention_out)) - - return out diff --git a/aria/vllm/vision_encoder.py b/aria/vllm/vision_encoder.py deleted file mode 100644 index 9423027..0000000 --- a/aria/vllm/vision_encoder.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import Optional, Tuple - -import torch -import torch.nn as nn -from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig -from vllm.config import QuantizationConfig -from vllm.model_executor.models.idefics2_vision_model import Idefics2VisionTransformer - - -class AriaVisionConfig(Idefics2VisionConfig): - model_type = "aria_vision_model" - - -class IdentityOp(torch.nn.Module): - """ - An identity operation that returns the input unchanged. - - This can be used as a placeholder or to maintain architectural consistency - when a specific operation is not needed. 
- """ - - def __init__(self, *args, **kwargs): - super().__init__() - - def forward(self, x, *args, **kwargs): - return x - - -class AriaVisionTransformer(Idefics2VisionTransformer): - def __init__( - self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__(config, quant_config, prefix) - self.post_layernorm = IdentityOp() - - -class AriaVisionModel(nn.Module): - config_class = AriaVisionConfig - - def __init__( - self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - *, - prefix: str = "", - ) -> None: - super().__init__() - - self.vision_model = AriaVisionTransformer( - config, - quant_config, - prefix=f"{prefix}.vision_model", - ) - - def forward( - self, - pixel_values: torch.Tensor, - pixel_mask: Optional[torch.BoolTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: - patch_attention_mask = self._create_patch_attention_mask(pixel_mask) - - vit_oup = self.vision_model( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - ) - - image_atts = self._create_image_attention_mask(patch_attention_mask) - - return vit_oup, image_atts - - def _create_patch_attention_mask(self, pixel_mask): - if pixel_mask is None: - return None - - patches_subgrid = pixel_mask.unfold( - dimension=1, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ).unfold( - dimension=2, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ) - return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - - def _create_image_attention_mask(self, patch_attention_mask): - if patch_attention_mask is None: - return None - - flattened_mask = patch_attention_mask.flatten(1) - return torch.logical_not(flattened_mask) From e40ac1a82b3594e361e5461043918f4815ff6f68 Mon Sep 17 00:00:00 2001 From: xffxff <1247714429@qq.com> Date: Thu, 21 Nov 2024 03:07:54 +0000 Subject: [PATCH 2/2] remove some comments --- aria/vllm/aria.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/aria/vllm/aria.py b/aria/vllm/aria.py index 2013fa5..56407f4 100644 --- a/aria/vllm/aria.py +++ b/aria/vllm/aria.py @@ -637,9 +637,6 @@ class MoELayer(nn.Module): This layer implements the MoE mechanism, which routes input tokens to different experts based on a routing algorithm, processes them through the experts, and then combines the outputs. - - Args: - config (AriaMoELMConfig): Configuration object for the MoE layer. """ def __init__( @@ -680,10 +677,6 @@ class MoEDecoderLayer(LlamaDecoderLayer): """ Custom Decoder Layer for the AriaMoE model which modifies the standard `LlamaDecoderLayer` by replacing the traditional MLP with a Mixture of Experts (MoE) Layer. - - Args: - config (LlamaConfig): Configuration object for the layer. - layer_idx (int): Index of the current layer in the model. """ def __init__( @@ -736,12 +729,6 @@ class AriaMoELMModel(LlamaModel): """ Custom LlamaModel for the AriaMoE model which modifies the standard LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`. - - This model implements a Mixture of Experts (MoE) approach, where each layer contains - multiple expert networks that specialize in different aspects of the input. - - Args: - config (LlamaConfig): Configuration object for the model. 
""" def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -751,7 +738,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - # FIXME(zhoufan): this is a hack to avoid the error: AttributeError: 'AriaMoELMModel' object has no attribute 'do_not_compile'. + # FIXME: this is a hack to disable the compilation of the model self.do_not_compile = True self.layers = None