Commit b7e5324: Cleanup

DarkLight1337 committed Jan 7, 2025 (1 parent: 62942e3)
Signed-off-by: DarkLight1337 <[email protected]>

Showing 5 changed files with 52 additions and 50 deletions.
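
This cleanup makes three kinds of changes across the LLaVA-family and Qwen2-VL processors. First, internal helpers (apply_feature_select_strategy, get_num_unpadded_features, get_num_frame_tokens, get_max_video_frames) gain a leading underscore to mark them private. Second, get_max_num_frames is renamed to get_num_frames_with_most_features, matching the existing get_image_size_with_most_features, and the dummy-input builders now cache its result in a target_num_frames local before building mm_data. Third, in llava_next.py the _get_mm_fields_config implementation moves from the base processor (where it is now declared abstract) into the concrete LlavaNextMultiModalProcessor, and llava_onevision.py drops its local get_image_size_with_most_features override along with the now-unused ImageSize import, relying on the inherited implementation.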
vllm/model_executor/models/llava.py (4 changes: 2 additions & 2 deletions)

@@ -123,7 +123,7 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {"image": self.get_max_image_tokens()}
 
-    def apply_feature_select_strategy(
+    def _apply_feature_select_strategy(
         self,
         strategy: str,
         encoder_num_image_tokens: int,
@@ -145,7 +145,7 @@ def get_num_image_tokens(
         hf_config = self.get_hf_config()
         vision_encoder_info = self.get_vision_encoder_info()
 
-        return self.apply_feature_select_strategy(
+        return self._apply_feature_select_strategy(
             hf_config.vision_feature_select_strategy,
             vision_encoder_info.get_num_image_tokens(
                 image_width=image_width,
vllm/model_executor/models/llava_next.py (25 changes: 17 additions & 8 deletions)

@@ -1,3 +1,4 @@
+from abc import abstractmethod
 from functools import cached_property
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
                     Protocol, Set, Tuple, TypedDict, TypeVar, Union)
@@ -82,7 +83,7 @@ def get_num_image_tokens(
         hf_config = self.get_hf_config()
         vision_encoder_info = self.get_vision_encoder_info()
 
-        base_feature_size = self.apply_feature_select_strategy(
+        base_feature_size = self._apply_feature_select_strategy(
             hf_config.vision_feature_select_strategy,
             vision_encoder_info.get_num_image_tokens(
                 image_width=image_width,
@@ -99,7 +100,7 @@ def get_num_image_tokens(
         (
             unpadded_feature_size,
             newline_feature_size,
-        ) = self.get_num_unpadded_features(
+        ) = self._get_num_unpadded_features(
             original_height=image_height,
             original_width=image_width,
             npatches=vision_encoder_info.get_patch_grid_length(),
@@ -110,7 +111,7 @@ def get_num_image_tokens(
         return unpadded_feature_size + newline_feature_size + base_feature_size
 
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
-    def get_num_unpadded_features(
+    def _get_num_unpadded_features(
         self,
         *,
         original_height: int,
@@ -162,6 +163,19 @@ def get_image_size_with_most_features(self) -> ImageSize:
 
 class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]):
 
+    # Copied from BaseMultiModalProcessor
+    @abstractmethod
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        raise NotImplementedError
+
+
+class LlavaNextMultiModalProcessor(
+        BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]):
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -174,11 +188,6 @@ def _get_mm_fields_config(
         )
 
 
-class LlavaNextMultiModalProcessor(
-        BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]):
-    pass
-
-
 @MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor,
                                         info=LlavaNextProcessingInfo,
                                         dummy=LlavaDummyInputsBuilder)
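
The llava_next.py change above inverts the earlier structure: instead of a concrete base method with a pass-only subclass, the base processor now declares _get_mm_fields_config abstract and the concrete subclass provides it. A minimal, self-contained sketch of that pattern (the class names and field values are illustrative stand-ins, not the real vLLM types):

    from abc import ABC, abstractmethod
    from typing import Mapping


    class BaseProcessor(ABC):

        # The base class only declares the hook; every concrete subclass
        # must state which multimodal fields it produces.
        @abstractmethod
        def _get_mm_fields_config(self) -> Mapping[str, str]:
            raise NotImplementedError


    class ConcreteProcessor(BaseProcessor):

        # The concrete subclass owns the actual field layout.
        def _get_mm_fields_config(self) -> Mapping[str, str]:
            return {"pixel_values": "batched"}


    proc = ConcreteProcessor()  # BaseProcessor() would raise TypeError
    print(proc._get_mm_fields_config())  # {'pixel_values': 'batched'}

The registry decorator at the bottom of the file still points at LlavaNextMultiModalProcessor, so external callers are unaffected by the reshuffle.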
vllm/model_executor/models/llava_next_video.py (17 changes: 10 additions & 7 deletions)

@@ -66,7 +66,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         max_video_tokens = self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_max_num_frames(seq_len),
+            num_frames=self.get_num_frames_with_most_features(seq_len),
         )
 
         return {"video": max_video_tokens}
@@ -76,7 +76,7 @@ def get_image_size_with_most_features(self) -> ImageSize:
         width = height = vision_encoder_info.get_image_size()
         return ImageSize(width=width, height=height)
 
-    def get_num_frame_tokens(
+    def _get_num_frame_tokens(
         self,
         *,
         image_width: int,
@@ -98,14 +98,14 @@ def get_num_video_tokens(
         image_height: int,
         num_frames: int,
     ) -> int:
-        num_frame_tokens = self.get_num_frame_tokens(
+        num_frame_tokens = self._get_num_frame_tokens(
             image_width=image_width,
             image_height=image_height,
         )
 
         return num_frame_tokens * num_frames
 
-    def get_max_video_frames(self, max_tokens: int) -> int:
+    def _get_max_video_frames(self, max_tokens: int) -> int:
         target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
@@ -125,11 +125,11 @@ def get_max_video_frames(self, max_tokens: int) -> int:
 
         return num_frames
 
-    def get_max_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_videos = mm_config.limit_per_prompt.get("video", 1)
 
-        max_total_frames = self.get_max_video_frames(seq_len)
+        max_total_frames = self._get_max_video_frames(seq_len)
 
         return max(max_total_frames // max(max_videos, 1), 1)
 
@@ -146,15 +146,18 @@ def get_dummy_processor_inputs(
 
         processor = self.info.get_hf_processor()
         video_token = processor.video_token
+
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
 
         mm_data = {
             "video":
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self.info.get_max_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
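
The renames in llava_next_video.py also pin down the frame-budget logic that get_num_frames_with_most_features exposes: _get_max_video_frames finds how many frames fit in a token budget, and the public method divides that total across the per-prompt video limit. A rough sketch of the arithmetic (the linear scan and the fixed tokens_per_frame are simplifications; the real helper prices frames through the vision encoder):

    def _get_max_video_frames(tokens_per_frame: int, max_tokens: int) -> int:
        # Largest frame count whose total token cost still fits the budget
        # (illustrative linear scan).
        num_frames = 0
        while (num_frames + 1) * tokens_per_frame <= max_tokens:
            num_frames += 1
        return num_frames


    def get_num_frames_with_most_features(seq_len: int, tokens_per_frame: int,
                                          max_videos: int) -> int:
        # Split the whole-sequence frame budget evenly across the maximum
        # number of videos allowed per prompt, but never return less than 1.
        max_total_frames = _get_max_video_frames(tokens_per_frame, seq_len)
        return max(max_total_frames // max(max_videos, 1), 1)


    # 4096-token budget, 576 tokens per frame, up to 2 videos -> 3 frames each
    print(get_num_frames_with_most_features(4096, 576, 2))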
vllm/model_executor/models/llava_onevision.py (41 changes: 14 additions & 27 deletions)

@@ -19,8 +19,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   VideoEmbeddingItems, VideoProcessorItems)
+from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
+                                   VideoProcessorItems)
 from vllm.multimodal.processing import PromptReplacement
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -109,7 +109,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
 
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
     # with additional logic afterwards taken from LlavaOnevisionProcessor
-    def get_num_unpadded_features(
+    def _get_num_unpadded_features(
         self,
         *,
         original_height: int,
@@ -145,23 +145,7 @@ def get_num_unpadded_features(
 
         return (unpadded_features, newline_features)
 
-    def get_image_size_with_most_features(self) -> ImageSize:
-        hf_config = self.get_hf_config()
-        largest_feature_size, largest_feature_pinpoint = 0, None
-        for (height, width) in hf_config.image_grid_pinpoints:
-            feat_size = self.get_num_image_tokens(image_width=width,
-                                                  image_height=height)
-            if feat_size > largest_feature_size:
-                largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
-
-        if largest_feature_size == 0 or largest_feature_pinpoint is None:
-            raise ValueError("Cannot have a largest feature size of 0!")
-
-        return largest_feature_pinpoint
-
-    def get_num_frame_tokens(
+    def _get_num_frame_tokens(
         self,
         *,
         image_width: int,
@@ -183,14 +167,14 @@ def get_num_video_tokens(
         image_height: int,
         num_frames: int,
     ) -> int:
-        num_frame_tokens = self.get_num_frame_tokens(
+        num_frame_tokens = self._get_num_frame_tokens(
             image_width=image_width,
             image_height=image_height,
         )
 
         return num_frame_tokens * num_frames + 1  # Newline token
 
-    def get_max_video_frames(self, max_tokens: int) -> int:
+    def _get_max_video_frames(self, max_tokens: int) -> int:
         target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
@@ -210,14 +194,14 @@ def get_max_video_frames(self, max_tokens: int) -> int:
 
         return num_frames
 
-    def get_max_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_images = mm_config.limit_per_prompt.get("image", 1)
         max_videos = mm_config.limit_per_prompt.get("video", 1)
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self.get_max_video_frames(seq_len -
-                                                     max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len -
+                                                      max_image_tokens)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                                    _MAX_FRAMES_PER_VIDEO)
 
@@ -229,7 +213,7 @@ def get_max_video_tokens(self, seq_len: int) -> int:
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_max_num_frames(seq_len),
+            num_frames=self.get_num_frames_with_most_features(seq_len),
         )
 
 
@@ -247,8 +231,11 @@ def get_dummy_processor_inputs(
         processor = self.info.get_hf_processor()
         image_token = processor.image_token
        video_token = processor.video_token
+
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
 
         mm_data = {
             "image":
@@ -259,7 +246,7 @@ def get_dummy_processor_inputs(
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self.info.get_max_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
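
LLaVA-OneVision's accounting differs from the video model above in two ways that are visible in this diff: each video costs one extra newline token, and the video frame budget is only what remains of seq_len after the worst-case image tokens are reserved. Sketching just those two deltas (the cap and per-frame cost are placeholder values, and the frame fit is again simplified to a division):

    _MAX_FRAMES_PER_VIDEO = 16  # placeholder for the module-level constant


    def get_num_video_tokens(tokens_per_frame: int, num_frames: int) -> int:
        # Frame tokens plus a single trailing newline token per video.
        return tokens_per_frame * num_frames + 1


    def get_num_frames_with_most_features(seq_len: int, tokens_per_frame: int,
                                          max_image_tokens: int, max_images: int,
                                          max_videos: int) -> int:
        # Reserve the worst-case image budget first, then fit frames into
        # the remainder and cap each video's share.
        video_budget = seq_len - max_image_tokens * max_images
        max_total_frames = max(video_budget, 0) // tokens_per_frame
        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                                   _MAX_FRAMES_PER_VIDEO)
        return max(max_frames_per_video, 1)


    # 8192-token budget, 576 tokens/frame, one 1536-token image, one video
    print(get_num_frames_with_most_features(8192, 576, 1536, 1, 1))  # -> 11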
vllm/model_executor/models/qwen2_vl.py (15 changes: 9 additions & 6 deletions)

@@ -836,7 +836,7 @@ def get_max_image_tokens(self) -> int:
             image_height=target_height,
         )
 
-    def get_max_video_frames(self, max_tokens: int) -> int:
+    def _get_max_video_frames(self, max_tokens: int) -> int:
         target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
@@ -856,14 +856,14 @@ def get_max_video_frames(self, max_tokens: int) -> int:
 
         return num_frames
 
-    def get_max_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_images = mm_config.limit_per_prompt.get("image", 1)
         max_videos = mm_config.limit_per_prompt.get("video", 1)
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self.get_max_video_frames(seq_len -
-                                                     max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len -
+                                                      max_image_tokens)
 
         num_frames = max(max_total_frames // max(max_videos, 1), 1)
 
@@ -879,7 +879,7 @@ def get_max_video_tokens(self, seq_len: int) -> int:
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_max_num_frames(seq_len),
+            num_frames=self.get_num_frames_with_most_features(seq_len),
         )
 
 
@@ -896,8 +896,11 @@ def get_dummy_processor_inputs(
         hf_processor = self.info.get_hf_processor()
         image_token: str = hf_processor.image_token
         video_token: str = hf_processor.video_token
+
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
 
         mm_data = {
             "image":
@@ -908,7 +911,7 @@ def get_dummy_processor_inputs(
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self.info.get_max_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
