Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support eager attention #30

Merged
merged 1 commit on Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions aria/model/configuration_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ def __init__(
self.ignore_index = ignore_index
self.image_token_index = image_token_index

attn_implementation = kwargs.pop("attn_implementation", None)

# Convert the keys and values of projector_patch_to_query_dict to integers
# This ensures consistency even if they were provided as strings
self.projector_patch_to_query_dict = {
Expand All @@ -76,10 +78,20 @@ def __init__(

if isinstance(vision_config, dict) and "model_type" in vision_config:
vision_config = AriaVisionConfig(**vision_config)
vision_attn_implementation = (
"flash_attention_2"
if attn_implementation is None
else attn_implementation
)
vision_config._attn_implementation = vision_attn_implementation

self.vision_config = vision_config

if isinstance(text_config, dict) and "model_type" in text_config:
text_attn_implementation = (
"sdpa" if attn_implementation is None else attn_implementation
)
text_config = AriaMoELMConfig(**text_config)
text_config._attn_implementation = text_attn_implementation

self.text_config = text_config
2 changes: 1 addition & 1 deletion aria/model/vision_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ def __init__(
**kwargs,
):
super().__init__(**kwargs)
self._attn_implementation = "flash_attention_2"


class IdentityOp(torch.nn.Module):
Expand Down Expand Up @@ -83,6 +82,7 @@ class AriaVisionModel(SiglipVisionModel):

config_class = AriaVisionConfig
main_input_name = "pixel_values"
_supports_sdpa = False

def __init__(self, config: AriaVisionConfig):
super().__init__(config)
Expand Down
Loading