
[V1][VLM] V1 support for selected single-image models. #11632

Merged (43 commits) on Dec 31, 2024
Commits (43)
425d3c4
batch
ywang96 Dec 29, 2024
1ca9369
Merge branch 'vllm-project:main' into v1-rearch-vlm
ywang96 Dec 29, 2024
8edcc83
blip2
ywang96 Dec 30, 2024
5f76291
chameleon
ywang96 Dec 30, 2024
814f3bd
fix util
ywang96 Dec 30, 2024
efeb999
fuyu
ywang96 Dec 30, 2024
5e568e8
aria
ywang96 Dec 30, 2024
135fd5c
fix profiling
ywang96 Dec 30, 2024
0a8dbe0
update
ywang96 Dec 30, 2024
03f741d
add llava-next
ywang96 Dec 30, 2024
8bce949
revert testing code
ywang96 Dec 30, 2024
bbde414
revert testing code
ywang96 Dec 30, 2024
ea928c6
tweak and clarify
ywang96 Dec 30, 2024
55eada7
clarify
ywang96 Dec 30, 2024
bbd5752
reword
ywang96 Dec 30, 2024
0452b99
Use merged multi-modal processor for blip2 and chameleon
DarkLight1337 Dec 30, 2024
938c0bf
Limit max num seqs
DarkLight1337 Dec 30, 2024
6cc54a7
Update comments
DarkLight1337 Dec 30, 2024
ba713ba
Be more clear
DarkLight1337 Dec 30, 2024
b0efc4f
Merged multi-modal processor for Aria
DarkLight1337 Dec 31, 2024
cdbd969
initialize fuyu merged processor
Isotr0py Dec 31, 2024
48c6946
Clean up
DarkLight1337 Dec 31, 2024
ea76759
Clean up
DarkLight1337 Dec 31, 2024
bc976a7
Try remove mark
DarkLight1337 Dec 31, 2024
f79f79a
Consolidate dummy data code
DarkLight1337 Dec 31, 2024
45ec10c
fix fuyu variant images test
Isotr0py Dec 31, 2024
0926717
Merge branch 'main' into v1-rearch-vlm
DarkLight1337 Dec 31, 2024
0fe561d
Fix some type errors in Pixtral-HF
DarkLight1337 Dec 31, 2024
3512ed6
fix missing flatten_bn in fuyu
Isotr0py Dec 31, 2024
5e0f66c
Update docs
DarkLight1337 Dec 31, 2024
1c243ab
Update docs
DarkLight1337 Dec 31, 2024
09d64f4
Get fuyu processor tests to pass
DarkLight1337 Dec 31, 2024
6d6d71c
Oops
DarkLight1337 Dec 31, 2024
ea93a2c
Fix unable to run model
DarkLight1337 Dec 31, 2024
9aeb7b2
Avoid warning from HF
DarkLight1337 Dec 31, 2024
768c1d9
fix too large image for fuyu
Isotr0py Dec 31, 2024
0c82c51
fix prompt token ids
Isotr0py Dec 31, 2024
d0d1fdc
Fix missing batch dimension in vision embeddings
DarkLight1337 Dec 31, 2024
afcf7b1
fix variant patches batching
Isotr0py Dec 31, 2024
cb9522d
Simplify the code
DarkLight1337 Dec 31, 2024
df832df
format
DarkLight1337 Dec 31, 2024
868e8e9
Merge branch 'vllm-project:main' into v1-rearch-vlm
ywang96 Dec 31, 2024
cc9c5f1
simplify
ywang96 Dec 31, 2024
10 changes: 5 additions & 5 deletions docs/source/models/supported_models.md
@@ -570,28 +570,28 @@ See [this page](#generative-models) for more information on how to use generative models.
- `rhymes-ai/Aria`
-
- ✅︎
-
- ✅︎
* - `Blip2ForConditionalGeneration`
- BLIP-2
- T + I<sup>E</sup>
- `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
-
- ✅︎
-
- ✅︎
* - `ChameleonForConditionalGeneration`
- Chameleon
- T + I
- `facebook/chameleon-7b` etc.
-
- ✅︎
-
- ✅︎
* - `FuyuForCausalLM`
- Fuyu
- T + I
- `adept/fuyu-8b` etc.
-
- ✅︎
-
- ✅︎
* - `ChatGLMModel`
- GLM-4V
- T + I
@@ -633,7 +633,7 @@ See [this page](#generative-models) for more information on how to use generative models.
- `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-
- ✅︎
-
- ✅︎
Comment from the PR author:
Llava-next was already supported on V1 so this is just a doc update.

* - `LlavaNextVideoForConditionalGeneration`
- LLaVA-NeXT-Video
- T + V
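A minimal sketch for trying one of the newly V1-enabled single-image models from the table above. It assumes the V1 engine is still opt-in via the `VLLM_USE_V1` environment variable (as it was around this release) and reuses the Chameleon prompt format from the example script in this PR; the placeholder image and sampling values are illustrative only.

```python
# Sketch: offline inference with a newly V1-supported single-image model.
# Assumption: V1 is enabled via the VLLM_USE_V1 environment variable.
import os

os.environ["VLLM_USE_V1"] = "1"  # opt into the V1 engine before importing vLLM

from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/chameleon-7b",
    max_model_len=4096,
    max_num_seqs=2,  # same limit this PR adds to the example script
)

question = "What is in this image?"
prompt = f"{question}<image>"  # Chameleon prompt format from the example script
image = Image.new("RGB", (512, 512), color="white")  # placeholder image

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```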
10 changes: 8 additions & 2 deletions examples/offline_inference_vision_language.py
@@ -24,10 +24,13 @@ def run_aria(question: str, modality: str):
assert modality == "image"
model_name = "rhymes-ai/Aria"

# NOTE: Need L40 (or equivalent) to avoid OOM
llm = LLM(model=model_name,
tokenizer_mode="slow",
trust_remote_code=True,
dtype="bfloat16",
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)

prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
@@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str):
prompt = f"{question}<image>"
llm = LLM(model="facebook/chameleon-7b",
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
stop_token_ids = None
return llm, prompt, stop_token_ids
@@ -257,7 +261,7 @@ def run_minicpmv(question: str, modality: str):
# 2.5
# model_name = "openbmb/MiniCPM-Llama3-V-2_5"

#2.6
# 2.6
model_name = "openbmb/MiniCPM-V-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
@@ -430,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str):

model_name = "mistral-community/pixtral-12b"

# NOTE: Need L40 (or equivalent) to avoid OOM
llm = LLM(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)

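The `max_num_seqs=2` additions and the "Need L40 (or equivalent) to avoid OOM" notes above keep the multimodal memory-profiling run and the batch size small. If the examples still run out of memory on smaller GPUs, here is a hedged sketch of other `LLM` constructor knobs that exist for this; the specific values are illustrative, not recommendations from this PR.

```python
# Sketch: extra memory-related knobs for running these examples on smaller GPUs.
# The values below are illustrative; they are not part of this PR.
from vllm import LLM

llm = LLM(
    model="facebook/chameleon-7b",
    max_model_len=2048,            # shorter context -> smaller KV cache
    max_num_seqs=1,                # run/profile one sequence at a time
    gpu_memory_utilization=0.85,   # leave headroom for other processes
    enforce_eager=True,            # skip CUDA graph capture to save memory
    disable_mm_preprocessor_cache=True,
)
```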
7 changes: 2 additions & 5 deletions tests/models/decoder_only/vision_language/test_models.py
@@ -140,10 +140,7 @@
"aria": VLMTestInfo(
models=["rhymes-ai/Aria"],
tokenizer_mode="slow",
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
),
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
dtype="bfloat16",
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
@@ -179,6 +176,7 @@
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
@@ -201,7 +199,6 @@
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[large_gpu_mark(min_gb=48)],
),
"glm4": VLMTestInfo(
models=["THUDM/glm-4v-9b"],
29 changes: 16 additions & 13 deletions tests/multimodal/test_processing.py
@@ -528,7 +528,7 @@ def _rand_audio(

def _test_processing_cache_correctness(
model_id: str,
modalities: set[str],
modalities: dict[str, bool],
hit_rate: float,
num_batches: int,
simplify_rate: float,
@@ -583,9 +583,8 @@ def _test_processing_cache_correctness(
partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000),
}
input_max_count = {
"image": 3,
"video": 3,
"audio": 3,
modality: 3 if supports_multi else 1
for modality, supports_multi in modalities.items()
}

for batch_idx in range(num_batches):
@@ -624,20 +623,24 @@

# yapf: disable
@pytest.mark.parametrize(("model_id", "modalities"), [
("llava-hf/llava-1.5-7b-hf", {"image"}),
("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}),
("mistral-community/pixtral-12b", {"image"}),
("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}),
("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}),
("fixie-ai/ultravox-v0_3", {"audio"}),
("rhymes-ai/Aria", {"image": True}),
("Salesforce/blip2-opt-2.7b", {"image": False}),
("facebook/chameleon-7b", {"image": True}),
("adept/fuyu-8b", {"image": False}),
("llava-hf/llava-1.5-7b-hf", {"image": True}),
("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
("mistral-community/pixtral-12b", {"image": True}),
("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
("fixie-ai/ultravox-v0_3", {"audio": True}),
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_cache_correctness(
model_id: str,
modalities: set[str],
modalities: dict[str, bool],
hit_rate: float,
num_batches: int,
simplify_rate: float,
@@ -653,15 +656,15 @@ def test_processing_cache_correctness(

# yapf: disable
@pytest.mark.parametrize(("model_id", "modalities"), [
("microsoft/Phi-3-vision-128k-instruct", {"image"}),
("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_cache_correctness_phi3v(
model_id: str,
modalities: set[str],
modalities: dict[str, bool],
hit_rate: float,
num_batches: int,
simplify_rate: float,
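On the `tests/multimodal/test_processing.py` change above: the `modalities` parameter moves from a `set[str]` to a `dict[str, bool]` that records whether the model accepts multiple items of each modality, and the new `input_max_count` comprehension turns that into a per-modality cap. A tiny standalone illustration, with values chosen only for the example:

```python
# Illustration of the input_max_count comprehension from the diff above:
# True  -> the model accepts multiple items of that modality (cap of 3),
# False -> single-item models such as BLIP-2 or Fuyu for images (cap of 1).
modalities = {"image": True, "video": False}

input_max_count = {
    modality: 3 if supports_multi else 1
    for modality, supports_multi in modalities.items()
}

assert input_max_count == {"image": 3, "video": 1}
```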