diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 32eed1a771718..9444dc43ea97e 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# nightly(bool): run this test in nightly pipeline only
 # optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatbile with command.
@@ -330,18 +331,28 @@ steps:
   commands:
     - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
 
-- label: Decoder-only Multi-Modal Models Test # 1h31min
+- label: Decoder-only Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/audio_language
   - tests/models/decoder_only/vision_language
   commands:
-    - pytest -v -s models/decoder_only/audio_language
+    - pytest -v -s models/decoder_only/audio_language -m core_model
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
+
+- label: Decoder-only Multi-Modal Models Test (Extended)
+  nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  commands:
+    - pytest -v -s models/decoder_only/audio_language -m 'not core_model'
     # HACK - run phi3v tests separately to sidestep this transformers bug
     # https://github.com/huggingface/transformers/issues/34307
     - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model'
 
 - label: Other Models Test # 6min
   #mirror_hardwares: [amd]
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 83d2548a506e4..60cdb186331fe 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -262,10 +262,9 @@ def run_qwen2_vl(question: str, modality: str):
 
     model_name = "Qwen/Qwen2-VL-7B-Instruct"
 
-    # Tested on L40
     llm = LLM(
         model=model_name,
-        max_model_len=8192,
+        max_model_len=4096,
         max_num_seqs=5,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index ad6c2d854d1f0..b9089e75ffab8 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -158,6 +158,7 @@ def run_multi_audio_test(
     assert all(tokens for tokens, *_ in vllm_outputs)
 
 
+@pytest.mark.core_model
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -178,6 +179,7 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
     )
 
 
+@pytest.mark.core_model
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
index 5c90e7f7a267c..c23fbedf0c6ae 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
@@ -17,7 +17,7 @@
 
 
 # Fixtures lazy import to avoid initializing CUDA during test collection
-# NOTE: Qwen2vl supports multiple input modalities, so it registers multiple
+# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
 # input mappers.
 @pytest.fixture()
 def image_input_mapper_for_qwen2_vl():
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 9370527e3cd57..d738647c91b66 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -75,6 +75,63 @@
 # this is a good idea for checking your command first, since tests are slow.
 
 VLM_TEST_SETTINGS = {
+    #### Core tests to always run in the CI
+    "llava": VLMTestInfo(
+        models=["llava-hf/llava-1.5-7b-hf"],
+        test_type=(
+            VLMTestType.EMBEDDING,
+            VLMTestType.IMAGE,
+            VLMTestType.CUSTOM_INPUTS
+        ),
+        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
+        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
+        max_model_len=4096,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
+        custom_test_opts=[CustomTestOptions(
+            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+                formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
+            ),
+            limit_mm_per_prompt={"image": 4},
+        )],
+        marks=[pytest.mark.core_model],
+    ),
+    "paligemma": VLMTestInfo(
+        models=["google/paligemma-3b-mix-224"],
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=identity,
+        img_idx_to_prompt = lambda idx: "",
+        # Paligemma uses its own sample prompts because the default one fails
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "caption es",
+            "cherry_blossom": "What is in the picture?",
+        }),
+        auto_cls=AutoModelForVision2Seq,
+        postprocess_inputs=model_utils.get_key_type_post_processor(
+            "pixel_values"
+        ),
+        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
+        dtype="half" if current_platform.is_rocm() else ("half", "float"),
+        marks=[pytest.mark.core_model],
+    ),
+    "qwen2_vl": VLMTestInfo(
+        models=["Qwen/Qwen2-VL-2B-Instruct"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        marks=[pytest.mark.core_model],
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+    ),
+    #### Extended model tests
     "blip2": VLMTestInfo(
         models=["Salesforce/blip2-opt-2.7b"],
         test_type=VLMTestType.IMAGE,
@@ -151,25 +208,6 @@
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
     ),
-    "llava": VLMTestInfo(
-        models=["llava-hf/llava-1.5-7b-hf"],
-        test_type=(
-            VLMTestType.EMBEDDING,
-            VLMTestType.IMAGE,
-            VLMTestType.CUSTOM_INPUTS
-        ),
-        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
-        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
-        max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
-        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
-        custom_test_opts=[CustomTestOptions(
-            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
-                formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
-            ),
-            limit_mm_per_prompt={"image": 4},
-        )],
-    ),
     "llava_next": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
@@ -200,12 +238,12 @@
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
         # Llava-one-vision tests fixed sizes & the default size factors
         image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
-        runner_mm_key="videos",
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                 formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
             ),
             limit_mm_per_prompt={"video": 4},
+            runner_mm_key="videos",
         )],
     ),
     # FIXME
@@ -218,9 +256,11 @@
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
         image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
-        runner_mm_key="videos",
         marks=[
-            pytest.mark.skip(reason="LLava next video tests currently fail.")
+            pytest.mark.skipif(
+                transformers.__version__.startswith("4.46"),
+                reason="Model broken with changes in transformers 4.46"
+            )
         ],
     ),
     "minicpmv": VLMTestInfo(
@@ -234,23 +274,6 @@
         postprocess_inputs=model_utils.wrap_inputs_post_processor,
         hf_output_post_proc=model_utils.minicmpv_trunc_hf_output,
     ),
-    "paligemma": VLMTestInfo(
-        models=["google/paligemma-3b-mix-224"],
-        test_type=VLMTestType.IMAGE,
-        prompt_formatter=identity,
-        img_idx_to_prompt = lambda idx: "",
-        # Paligemma uses its own sample prompts because the default one fails
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "caption es",
-            "cherry_blossom": "What is in the picture?",
-        }),
-        auto_cls=AutoModelForVision2Seq,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
-            "pixel_values"
-        ),
-        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        dtype="half" if current_platform.is_rocm() else ("half", "float"),
-    ),
     # Tests for phi3v currently live in another file because of a bug in
     # transformers. Once this issue is fixed, we can enable them here instead.
     # https://github.com/huggingface/transformers/issues/34307
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 6856e8df81a13..e925934db0e7c 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -56,6 +56,17 @@ def qwen_vllm_to_hf_output(
     return output_ids, hf_output_str, out_logprobs
 
 
+def qwen2_vllm_to_hf_output(
+        vllm_output: RunnerOutput,
+        model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
+    """Sanitize vllm output [qwen2 models] to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    hf_output_str = output_str + "<|im_end|>"
+
+    return output_ids, hf_output_str, out_logprobs
+
+
 def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
                                   model: str) -> RunnerOutput:
     config = AutoConfig.from_pretrained(model)
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
index 5a3f9e820dad0..2d3b39fe3594e 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
@@ -29,6 +29,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
         num_logprobs=test_case.num_logprobs,
         limit_mm_per_prompt={"image": 1},
         distributed_executor_backend=test_case.distributed_executor_backend,
+        runner_mm_key="images",
         **model_test_info.get_non_parametrized_runner_kwargs())
 
 
@@ -51,6 +52,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
         num_logprobs=test_case.num_logprobs,
         limit_mm_per_prompt={"image": len(image_assets)},
         distributed_executor_backend=test_case.distributed_executor_backend,
+        runner_mm_key="images",
         **model_test_info.get_non_parametrized_runner_kwargs())
 
 
@@ -74,6 +76,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
         limit_mm_per_prompt={"image": 1},
         vllm_embeddings=vllm_embeddings,
         distributed_executor_backend=test_case.distributed_executor_backend,
+        runner_mm_key="images",
         **model_test_info.get_non_parametrized_runner_kwargs())
 
 
@@ -101,6 +104,7 @@ def run_video_test(
         num_logprobs=test_case.num_logprobs,
         limit_mm_per_prompt={"video": len(video_assets)},
         distributed_executor_backend=test_case.distributed_executor_backend,
+        runner_mm_key="videos",
         **model_test_info.get_non_parametrized_runner_kwargs())
 
 
@@ -115,7 +119,11 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
 
     inputs = test_case.custom_test_opts.inputs
     limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
-    assert inputs is not None and limit_mm_per_prompt is not None
+    runner_mm_key = test_case.custom_test_opts.runner_mm_key
+    # Inputs, limit_mm_per_prompt, and runner_mm_key should all be set
+    assert inputs is not None
+    assert limit_mm_per_prompt is not None
+    assert runner_mm_key is not None
 
     core.run_test(
         hf_runner=hf_runner,
@@ -127,4 +135,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
         num_logprobs=test_case.num_logprobs,
         limit_mm_per_prompt=limit_mm_per_prompt,
         distributed_executor_backend=test_case.distributed_executor_backend,
+        runner_mm_key=runner_mm_key,
         **model_test_info.get_non_parametrized_runner_kwargs())
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py
index 4d18d53af30fa..fd18c7c8346f0 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
@@ -52,6 +52,8 @@ class SizeType(Enum):
 class CustomTestOptions(NamedTuple):
     inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]]
     limit_mm_per_prompt: Dict[str, int]
+    # kwarg to pass multimodal data in as to vllm/hf runner instances.
+    runner_mm_key: str = "images"
 
 
 class ImageSizeWrapper(NamedTuple):
@@ -141,9 +143,6 @@ class VLMTestInfo(NamedTuple):
                             Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]], str]] = None  # noqa: E501
 
-    # kwarg to pass multimodal data in as to vllm/hf runner instances
-    runner_mm_key: str = "images"
-
     # Allows configuring a test to run with custom inputs
     custom_test_opts: Optional[List[CustomTestOptions]] = None
 
@@ -168,7 +167,6 @@ def get_non_parametrized_runner_kwargs(self):
             "get_stop_token_ids": self.get_stop_token_ids,
             "model_kwargs": self.model_kwargs,
             "patch_hf_runner": self.patch_hf_runner,
-            "runner_mm_key": self.runner_mm_key,
         }
 
 
diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py
index a8d0ac4fc160d..9fab5898a06ba 100644
--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -2,6 +2,7 @@
 
 import pytest
 import torch.nn.functional as F
+import transformers
 from transformers import AutoModelForVision2Seq
 
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
@@ -85,8 +86,8 @@ def _run_test(
     )
 
 
-# FIXME
-@pytest.mark.skip(reason="LLava next embedding tests currently fail")
+@pytest.mark.skipif(transformers.__version__.startswith("4.46"),
+                    reason="Model broken with changes in transformers 4.46")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_models_text(
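Note on the marker driving the pipeline split above: tests tagged with the `core_model` pytest marker run in the per-commit (Standard) job via `-m core_model`, while everything else is picked up by the nightly (Extended) job via `-m 'not core_model'`. The snippet below is an illustrative sketch only and is not part of this diff; it assumes the marker is not already registered elsewhere in vLLM's pytest configuration (for example in pyproject.toml), and the `conftest.py` placement is hypothetical.

```python
# conftest.py -- illustrative sketch only; not part of this diff.
# Assumes the "core_model" marker is not already declared in the project's
# pytest configuration (e.g. [tool.pytest.ini_options] in pyproject.toml).
import pytest


def pytest_configure(config: pytest.Config) -> None:
    # Registering the marker avoids PytestUnknownMarkWarning and documents
    # the Standard-vs-Extended CI split used in the pipeline above.
    config.addinivalue_line(
        "markers",
        "core_model: run this model test on every commit (Standard job); "
        "unmarked tests only run in the nightly (Extended) job",
    )
```

Because `-m core_model` selects only tagged tests and `-m 'not core_model'` selects the rest, a newly added model test defaults to the nightly (Extended) job unless it is explicitly marked.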