FEAT: support VL models for MLX (#2638)
qinxuye authored Dec 12, 2024
1 parent dcf3fa8 commit 6b0bf6f
Showing 8 changed files with 358 additions and 61 deletions.
47 changes: 25 additions & 22 deletions .github/workflows/python.yaml
@@ -117,33 +117,36 @@ jobs:
            sudo rm -rf "/usr/local/share/boost"
            sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          fi
+         pip install -e ".[dev]"
          if [ "$MODULE" == "metal" ]; then
            conda install -c conda-forge "ffmpeg<7"
            pip install mlx-lm
+           pip install mlx-vlm
            pip install mlx-whisper
+           pip install qwen-vl-utils
+         else
+           pip install "llama-cpp-python==0.2.77" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+           pip install transformers
+           pip install attrdict
+           pip install "timm>=0.9.16"
+           pip install torch torchvision
+           pip install accelerate
+           pip install sentencepiece
+           pip install transformers_stream_generator
+           pip install bitsandbytes
+           pip install "sentence-transformers>=2.3.1"
+           pip install modelscope
+           pip install diffusers
+           pip install protobuf
+           pip install FlagEmbedding
+           pip install "tenacity>=8.2.0,<8.4.0"
+           pip install "jinja2==3.1.2"
+           pip install tensorizer
+           pip install jj-pytorchvideo
+           pip install qwen-vl-utils
+           pip install datamodel_code_generator
+           pip install jsonschema
          fi
-         pip install "llama-cpp-python==0.2.77" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-         pip install transformers
-         pip install attrdict
-         pip install "timm>=0.9.16"
-         pip install torch torchvision
-         pip install accelerate
-         pip install sentencepiece
-         pip install transformers_stream_generator
-         pip install bitsandbytes
-         pip install "sentence-transformers>=2.3.1"
-         pip install modelscope
-         pip install diffusers
-         pip install protobuf
-         pip install FlagEmbedding
-         pip install "tenacity>=8.2.0,<8.4.0"
-         pip install -e ".[dev]"
-         pip install "jinja2==3.1.2"
-         pip install tensorizer
-         pip install jj-pytorchvideo
-         pip install qwen-vl-utils
-         pip install datamodel_code_generator
-         pip install jsonschema
working-directory: .

- name: Test with pytest
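The metal branch of the workflow now pulls in mlx-vlm and qwen-vl-utils next to mlx-lm and mlx-whisper, while the CUDA/CPU dependencies move into the else branch so they are skipped on Apple silicon. A quick way to sanity-check the metal environment after those installs is a short import loop; this is an illustrative sketch, not part of the workflow itself:

# Hypothetical post-install smoke test for the "metal" CI branch.
# The pip package names above map to these importable module names.
import importlib

for module in ("mlx_lm", "mlx_vlm", "mlx_whisper", "qwen_vl_utils"):
    importlib.import_module(module)
    print(f"{module} imported OK")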
4 changes: 4 additions & 0 deletions setup.cfg
@@ -100,7 +100,9 @@ all =
    outlines>=0.0.34
    sglang>=0.2.7 ; sys_platform=='linux'
    mlx-lm ; sys_platform=='darwin' and platform_machine=='arm64'
+   mlx-vlm ; sys_platform=='darwin' and platform_machine=='arm64'
    mlx-whisper ; sys_platform=='darwin' and platform_machine=='arm64'
+   qwen_vl_utils
    attrdict # For deepseek VL
    timm>=0.9.16 # For deepseek VL
    torchvision # For deepseek VL
@@ -180,7 +182,9 @@ sglang =
    outlines>=0.0.34
mlx =
    mlx-lm
+   mlx-vlm
    mlx-whisper
+   qwen_vl_utils
embedding =
    sentence-transformers>=3.1.0
rerank =
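In both the all and mlx extras, mlx-vlm carries the same environment markers as the existing MLX packages, so it only installs on Apple-silicon macOS, while qwen_vl_utils is unmarked and installs everywhere. The marker condition corresponds to this runtime check, a minimal sketch mirroring setup.cfg:

import platform
import sys

def mlx_supported() -> bool:
    # Same condition as the environment markers in setup.cfg:
    #   sys_platform=='darwin' and platform_machine=='arm64'
    return sys.platform == "darwin" and platform.machine() == "arm64"

print(mlx_supported())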
4 changes: 2 additions & 2 deletions xinference/model/llm/__init__.py
@@ -131,7 +131,7 @@ def register_custom_model():
def _install():
    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
    from .lmdeploy.core import LMDeployChatModel, LMDeployModel
-   from .mlx.core import MLXChatModel, MLXModel
+   from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
    from .sglang.core import SGLANGChatModel, SGLANGModel
    from .transformers.chatglm import ChatglmPytorchChatModel
    from .transformers.cogvlm2 import CogVLM2Model
@@ -172,7 +172,7 @@ def _install():
    )
    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
-   MLX_CLASSES.extend([MLXModel, MLXChatModel])
+   MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
    LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
    TRANSFORMERS_CLASSES.extend(
        [
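MLXVisionModel is registered by appending it to MLX_CLASSES, the list xinference walks when binding a model spec to an engine implementation. The dispatch pattern is roughly the following sketch; the function name and signature are illustrative, not the exact xinference code:

# Illustrative: each registered class exposes a match() predicate, and the
# first class that accepts the (family, spec, quantization) triple wins.
def select_implementation(classes, llm_family, llm_spec, quantization):
    for cls in classes:
        if cls.match(llm_family, llm_spec, quantization):
            return cls
    raise ValueError("no registered class matches this model spec")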
29 changes: 28 additions & 1 deletion xinference/model/llm/llm_family.json
@@ -6925,7 +6925,7 @@
"model_id":"Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4",
"model_revision":"800d396518c82960ce6d231adecd07bbc474f0a9"
},
{
"model_format":"awq",
"model_size_in_billions":2,
"quantizations":[
@@ -6934,6 +6934,15 @@
"model_id":"Qwen/Qwen2-VL-2B-Instruct-AWQ",
"model_revision":"ea8c5854c0044e28626719292de0d9b1a671f6fc"
},
+{
+"model_format":"mlx",
+"model_size_in_billions":2,
+"quantizations":[
+"4bit",
+"8bit"
+],
+"model_id":"mlx-community/Qwen2-VL-2B-Instruct-{quantization}"
+},
{
"model_format":"pytorch",
"model_size_in_billions":7,
@@ -6970,6 +6979,15 @@
"model_id":"Qwen/Qwen2-VL-7B-Instruct-AWQ",
"model_revision":"f94216e8b513933bccd567bcd9b7350199f32538"
},
+{
+"model_format":"mlx",
+"model_size_in_billions":7,
+"quantizations":[
+"4bit",
+"8bit"
+],
+"model_id":"mlx-community/Qwen2-VL-7B-Instruct-{quantization}"
+},
{
"model_format":"pytorch",
"model_size_in_billions":72,
@@ -6994,6 +7012,15 @@
"Int8"
],
"model_id":"Qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}"
+},
+{
+"model_format":"mlx",
+"model_size_in_billions":72,
+"quantizations":[
+"4bit",
+"8bit"
+],
+"model_id":"mlx-community/Qwen2-VL-72B-Instruct-{quantization}"
}
],
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
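Each new mlx spec stores a templated model_id whose {quantization} placeholder is filled with the chosen quantization at download time; judging by the syntax, this is str.format-style substitution. For example:

# "4bit" and "8bit" are the quantizations declared in the specs above.
model_id = "mlx-community/Qwen2-VL-7B-Instruct-{quantization}"
print(model_id.format(quantization="4bit"))  # mlx-community/Qwen2-VL-7B-Instruct-4bit
print(model_id.format(quantization="8bit"))  # mlx-community/Qwen2-VL-7B-Instruct-8bit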
31 changes: 31 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -4649,6 +4649,16 @@
"model_id":"qwen/Qwen2-VL-7B-Instruct-AWQ",
"model_revision":"master"
},
+{
+"model_format":"mlx",
+"model_size_in_billions":7,
+"quantizations":[
+"8bit"
+],
+"model_hub": "modelscope",
+"model_id":"okwinds/Qwen2-VL-7B-Instruct-MLX-8bit",
+"model_revision":"master"
+},
{
"model_format":"pytorch",
"model_size_in_billions":2,
@@ -4689,6 +4699,16 @@
"model_id":"qwen/Qwen2-VL-2B-Instruct-AWQ",
"model_revision":"master"
},
+{
+"model_format":"mlx",
+"model_size_in_billions":2,
+"quantizations":[
+"8bit"
+],
+"model_hub": "modelscope",
+"model_id":"okwinds/Qwen2-VL-2B-Instruct-MLX-8bit",
+"model_revision":"master"
+},
{
"model_format":"pytorch",
"model_size_in_billions":72,
@@ -4716,6 +4736,17 @@
],
"model_id":"qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}",
"model_hub": "modelscope"
+},
+{
+"model_format":"mlx",
+"model_size_in_billions":72,
+"quantizations":[
+"4bit",
+"8bit"
+],
+"model_hub": "modelscope",
+"model_id":"okwinds/Qwen2-VL-72B-Instruct-MLX-{quantization}",
+"model_revision":"master"
}
],
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
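The chat_template above is an ordinary Jinja2 template: it wraps each image or video part in <|vision_start|>...<|vision_end|> markers and counts the parts with namespace variables. A minimal render, assuming jinja2 is installed; chat_template is copied verbatim from the spec above:

from jinja2 import Template

chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
print(Template(chat_template).render(messages=messages, add_generation_prompt=True))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>
# <|im_start|>assistant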

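Taken together, these changes let Apple-silicon users serve Qwen2-VL through the MLX engine end to end. A hedged sketch of client-side usage follows; the model name, engine string, argument set, and image URL are assumptions for illustration, and exact names depend on the installed xinference version:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumes a locally running xinference server

# Model/engine names here are illustrative; check your version's
# registrations for the exact values.
model_uid = client.launch_model(
    model_name="qwen2-vl-instruct",
    model_engine="MLX",
    model_format="mlx",
    model_size_in_billions=2,
    quantization="4bit",
)

model = client.get_model(model_uid)
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "What is in this image?"},
            ],
        }
    ]
)
print(response["choices"][0]["message"]["content"])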