Skip to content

Commit

Permalink
[Model] Adding Support for Qwen2VL as an Embedding Model. Using MrLig…
Browse files Browse the repository at this point in the history
…ht/dse-qwen2-2b-mrl-v1 (#9944)

Signed-off-by: FurtherAI <[email protected]>
Co-authored-by: FurtherAI <[email protected]>
  • Loading branch information
FurtherAI and FurtherAI authored Nov 13, 2024
1 parent 3945c82 commit 1b886aa
Show file tree
Hide file tree
Showing 8 changed files with 364 additions and 19 deletions.
6 changes: 6 additions & 0 deletions docs/source/models/supported_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,12 @@ Multimodal Embedding
- :code:`TIGER-Lab/VLM2Vec-Full`
- 🚧
- ✅︎
* - :code:`Qwen2VLForConditionalGeneration`
- Qwen2-VL-based
- T + I
- :code:`MrLight/dse-qwen2-2b-mrl-v1`
-
- ✅︎

.. important::
Some model architectures support both generation and embedding tasks.
Expand Down
17 changes: 17 additions & 0 deletions docs/source/models/vlm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -310,4 +310,21 @@ Since the request schema is not defined by OpenAI client, we post a request to t
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model.

.. code-block:: bash
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \
--trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
.. important::

Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings,
which is handled by the jinja template.

.. important::

Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code
example below for details.

A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py>`_.
123 changes: 105 additions & 18 deletions examples/openai_chat_embedding_client_for_multimodal.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,120 @@
import argparse
import base64
import io

import requests
from PIL import Image

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model":
"TIGER-Lab/VLM2Vec-Full",
"messages": [{

def vlm2vec():
response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model":
"TIGER-Lab/VLM2Vec-Full",
"messages": [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "Represent the given image."
},
],
}],
"encoding_format":
"float",
},
)
response.raise_for_status()
response_json = response.json()

print("Embedding output:", response_json["data"][0]["embedding"])


def dse_qwen2_vl(inp: dict):
# Embedding an Image
if inp["dtype"] == "image":
messages = [{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": inp["image_url"],
}
}, {
"type": "text",
"text": "What is shown in this image?"
}]
}]
# Embedding a Text Query
else:
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
# of the minimum input size
buffer = io.BytesIO()
image_placeholder = Image.new("RGB", (56, 56))
image_placeholder.save(buffer, "png")
buffer.seek(0)
image_placeholder = base64.b64encode(buffer.read()).decode('utf-8')
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
"url": f"data:image/jpeg;base64,{image_placeholder}",
}
},
{
"type": "text",
"text": "Represent the given image."
"text": f"Query: {inp['content']}"
},
],
}],
"encoding_format":
"float",
},
)
response.raise_for_status()
response_json = response.json()

print("Embedding output:", response_json["data"][0]["embedding"])
]
}]

response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model": "MrLight/dse-qwen2-2b-mrl-v1",
"messages": messages,
"encoding_format": "float",
},
)
response.raise_for_status()
response_json = response.json()

print("Embedding output:", response_json["data"][0]["embedding"])


if __name__ == '__main__':
parser = argparse.ArgumentParser(
"Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embedding before running this.")
parser.add_argument("model",
type=str,
choices=["vlm2vec", "dse_qwen2_vl"],
required=True,
help="Which model to call.")
args = parser.parse_args()

if args.model == "vlm2vec":
vlm2vec()
elif args.model == "dse_qwen2_vl":
dse_qwen2_vl({
"dtye": "image",
"image_url": image_url,
})
dse_qwen2_vl({
"dtype": "text",
"content": "What is the weather like today?",
})
7 changes: 7 additions & 0 deletions examples/template_dse_qwen2_vl.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system
You are a helpful assistant.<|im_end|>
{% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %}
{% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %}
{% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %}
{% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %}
{% endraw %}{% endif %}<|endoftext|>
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ def video_assets() -> _VideoAssets:
class HfRunner:

def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
if x is None or isinstance(x, (bool, )):
return x

if device is None:
device = "cpu" if current_platform.is_cpu() else "cuda"

Expand Down
Loading

0 comments on commit 1b886aa

Please sign in to comment.