[Frontend] Multimodal support in offline chat (vllm-project#8098)

ruisearch42 · Sep 4, 2024 · 855c262 · 855c262
1 parent 2be8ec6
commit 855c262
Show file tree

Hide file tree

Showing 8 changed files with 356 additions and 112 deletions.
diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
@@ -6,6 +6,7 @@
 from vllm import LLM, RequestOutput, SamplingParams
 
 from ...conftest import cleanup
+from ..openai.test_vision import TEST_IMAGE_URLS
 
 MODEL_NAME = "facebook/opt-125m"
 
@@ -159,3 +160,36 @@ def test_chat():
     ]
     outputs = llm.chat(messages)
     assert len(outputs) == 1
+
+
+@pytest.mark.parametrize("image_urls",
+                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+def test_chat_multi_image(image_urls: List[str]):
+    """Chat once with two images and expect exactly one output back."""
+    llm = LLM(
+        model="microsoft/Phi-3.5-vision-instruct",
+        dtype="bfloat16",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 2},
+    )
+
+    messages = [{
+        "role": "user",
+        "content": [
+            *({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            } for image_url in image_urls),
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    outputs = llm.chat(messages)
+    assert len(outputs) == 1
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
@@ -1,11 +1,14 @@
 import warnings
+from typing import Optional
 
 import pytest
 from PIL import Image
 
 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
-from vllm.entrypoints.chat_utils import parse_chat_messages
+from vllm.entrypoints.chat_utils import (parse_chat_messages,
+                                         parse_chat_messages_futures)
+from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import encode_image_base64
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 
@@ -42,10 +45,28 @@ def image_url():
     return f"data:image/jpeg;base64,{base64}"
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_with_image_url(phi3v_model_config,
-                                                  phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+def _assert_mm_data_is_image_input(
+    mm_data: Optional[MultiModalDataDict],
+    image_count: int,
+) -> None:
+    """Assert ``mm_data`` holds only ``"image"`` data for ``image_count``."""
+    assert mm_data is not None
+    assert set(mm_data.keys()) == {"image"}
+    images = mm_data.get("image")
+    assert images is not None
+    # Expect a bare PIL image for one image, else a list of that length.
+    if image_count != 1:
+        assert isinstance(images, list) and len(images) == image_count
+    else:
+        assert isinstance(images, Image.Image)
+
+
+def test_parse_chat_messages_single_image(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -63,15 +84,42 @@ async def test_parse_chat_messages_with_image_url(phi3v_model_config,
         "role": "user",
         "content": "<|image_1|>\nWhat's in the image?"
     }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert isinstance(mm_data["image"], Image.Image)
+    _assert_mm_data_is_image_input(mm_data, 1)
 
 
 @pytest.mark.asyncio
-async def test_parse_chat_messages_multiple_images(phi3v_model_config,
-                                                   phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+async def test_parse_chat_messages_single_image_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    """Parse one image with the futures variant and await its data."""
+    image_part = {"type": "image_url", "image_url": {"url": image_url}}
+    text_part = {"type": "text", "text": "What's in the image?"}
+    messages = [{"role": "user", "content": [image_part, text_part]}]
+
+    parsed, pending_mm_data = parse_chat_messages_futures(
+        messages,
+        phi3v_model_config,
+        phi3v_tokenizer,
+    )
+
+    expected = [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    assert parsed == expected
+
+    mm_data = await pending_mm_data
+    _assert_mm_data_is_image_input(mm_data, 1)
+
+
+def test_parse_chat_messages_multiple_images(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -96,15 +144,49 @@ async def test_parse_chat_messages_multiple_images(phi3v_model_config,
         "content":
         "<|image_1|>\n<|image_2|>\nWhat's in these images?"
     }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)
 
 
 @pytest.mark.asyncio
-async def test_parse_chat_messages_placeholder_already_in_prompt(
-        phi3v_model_config, phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+async def test_parse_chat_messages_multiple_images_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    """Parse two images with the futures variant and await their data."""
+    image_count = 2
+    # The same fixture URL is reused for every image part.
+    image_parts = [{
+        "type": "image_url",
+        "image_url": {
+            "url": image_url
+        }
+    } for _ in range(image_count)]
+    text_part = {"type": "text", "text": "What's in these images?"}
+    messages = [{"role": "user", "content": [*image_parts, text_part]}]
+
+    parsed, pending_mm_data = parse_chat_messages_futures(
+        messages,
+        phi3v_model_config,
+        phi3v_tokenizer,
+    )
+
+    expected_text = "<|image_1|>\n<|image_2|>\nWhat's in these images?"
+    assert parsed == [{
+        "role": "user",
+        "content": expected_text
+    }]
+
+    mm_data = await pending_mm_data
+    _assert_mm_data_is_image_input(mm_data, image_count)
+
+
+def test_parse_chat_messages_placeholder_already_in_prompt(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -131,15 +213,15 @@ async def test_parse_chat_messages_placeholder_already_in_prompt(
         "content":
         "What's in <|image_1|> and how does it compare to <|image_2|>?"
     }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_placeholder_one_already_in_prompt(
-        phi3v_model_config, phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+def test_parse_chat_messages_placeholder_one_already_in_prompt(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -167,15 +249,15 @@ async def test_parse_chat_messages_placeholder_one_already_in_prompt(
         "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
         "other one?"
     }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_multiple_images_across_messages(
-        phi3v_model_config, phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+def test_parse_chat_messages_multiple_images_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -218,14 +300,14 @@ async def test_parse_chat_messages_multiple_images_across_messages(
             "content": "<|image_2|>\nWhat about this one?"
         },
     ]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_rejects_too_many_images_in_one_message(
-        phi3v_model_config, phi3v_tokenizer, image_url):
+def test_parse_chat_messages_rejects_too_many_images_in_one_message(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
     with warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
@@ -259,9 +341,11 @@ async def test_parse_chat_messages_rejects_too_many_images_in_one_message(
             }], phi3v_model_config, phi3v_tokenizer)
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_rejects_too_many_images_across_messages(
-        phi3v_model_config, phi3v_tokenizer, image_url):
+def test_parse_chat_messages_rejects_too_many_images_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
     with warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",