From 9279b9f83dd3aa5bb3d3ce57bf92d9361755d164 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 6 Jan 2025 05:48:53 -0800
Subject: [PATCH] [Bugfix] Fix max image size for LLaVA-Onevision (#11769)

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/llava_onevision.py | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 6dccc1e0d3b8d..5eac2f223d794 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -19,8 +19,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
-from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
-                                   VideoProcessorItems)
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+                                   VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement
 from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -170,6 +170,22 @@ def _get_num_video_tokens(
 class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin,
                                   BaseLlavaProfilingInfo):
 
+    def _get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self._get_hf_config()
+        largest_feature_size, largest_feature_pinpoint = 0, None
+        for (height, width) in hf_config.image_grid_pinpoints:
+            feat_size = self._get_num_image_tokens(image_width=width,
+                                                   image_height=height)
+            if feat_size > largest_feature_size:
+                largest_feature_size = feat_size
+                largest_feature_pinpoint = ImageSize(width=width,
+                                                     height=height)
+
+        if largest_feature_size == 0 or largest_feature_pinpoint is None:
+            raise ValueError("Cannot have a largest feature size of 0!")
+
+        return largest_feature_pinpoint
+
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 