From 4f942750fdb23b4a59a2ddb95eb844ae442bfa88 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 5 Nov 2024 14:58:49 +0000 Subject: [PATCH 1/3] Rename `MultiModalInputs -> MultiModalKwargs` Signed-off-by: DarkLight1337 --- .../dev/multimodal/multimodal_index.rst | 2 +- .../mm_processor_kwargs/test_qwen.py | 4 ++-- tests/multimodal/test_base.py | 22 +++++++++---------- vllm/model_executor/models/chatglm.py | 4 ++-- vllm/model_executor/models/fuyu.py | 4 ++-- vllm/model_executor/models/h2ovl.py | 10 ++++----- vllm/model_executor/models/internvl.py | 6 ++--- vllm/model_executor/models/minicpmv.py | 4 ++-- vllm/model_executor/models/mllama.py | 2 +- vllm/model_executor/models/molmo.py | 4 ++-- vllm/model_executor/models/pixtral.py | 10 ++++----- vllm/model_executor/models/qwen.py | 12 +++++----- vllm/model_executor/models/qwen2_audio.py | 8 +++---- vllm/model_executor/models/qwen2_vl.py | 8 +++---- vllm/model_executor/models/ultravox.py | 8 +++---- vllm/multimodal/__init__.py | 4 ++-- vllm/multimodal/audio.py | 4 ++-- vllm/multimodal/base.py | 20 ++++++++--------- vllm/multimodal/image.py | 10 ++++----- vllm/multimodal/registry.py | 6 ++--- vllm/multimodal/video.py | 6 ++--- vllm/spec_decode/draft_model_runner.py | 4 ++-- vllm/worker/cpu_enc_dec_model_runner.py | 4 ++-- vllm/worker/cpu_model_runner.py | 10 ++++----- vllm/worker/embedding_model_runner.py | 4 ++-- vllm/worker/enc_dec_model_runner.py | 4 ++-- vllm/worker/model_runner.py | 18 +++++++-------- vllm/worker/neuron_model_runner.py | 10 ++++----- vllm/worker/openvino_model_runner.py | 10 ++++----- vllm/worker/xpu_model_runner.py | 10 ++++----- 30 files changed, 116 insertions(+), 116 deletions(-) diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst index e112b43aade5e..30f543abc20c7 100644 --- a/docs/source/dev/multimodal/multimodal_index.rst +++ b/docs/source/dev/multimodal/multimodal_index.rst @@ -53,7 +53,7 @@ Base Classes .. autodata:: vllm.multimodal.MultiModalDataDict -.. autoclass:: vllm.multimodal.MultiModalInputs +.. autoclass:: vllm.multimodal.MultiModalKwargs :members: :show-inheritance: diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index a01651b171d60..2ef076de9ba74 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -6,7 +6,7 @@ from PIL.Image import Image from vllm.inputs import InputContext, token_inputs -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import IMAGE_ASSETS @@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) # Ensure that we get the appropriately shaped pixel_values # for images and image embeddings, respectively. - assert isinstance(mapped_img_data, MultiModalInputs) + assert isinstance(mapped_img_data, MultiModalKwargs) assert "pixel_values" in mapped_img_data assert mapped_img_data["pixel_values"].shape == expected_shape diff --git a/tests/multimodal/test_base.py b/tests/multimodal/test_base.py index 68d05de904ba8..bfaf2cdeaa8d4 100644 --- a/tests/multimodal/test_base.py +++ b/tests/multimodal/test_base.py @@ -1,6 +1,6 @@ import torch -from vllm.multimodal.base import MultiModalInputs, NestedTensors +from vllm.multimodal.base import MultiModalKwargs, NestedTensors def assert_nested_tensors_equal(expected: NestedTensors, @@ -13,8 +13,8 @@ def assert_nested_tensors_equal(expected: NestedTensors, assert_nested_tensors_equal(expected_item, actual_item) -def assert_multimodal_inputs_equal(expected: MultiModalInputs, - actual: MultiModalInputs): +def assert_multimodal_inputs_equal(expected: MultiModalKwargs, + actual: MultiModalKwargs): assert set(expected.keys()) == set(actual.keys()) for key in expected: assert_nested_tensors_equal(expected[key], actual[key]) @@ -22,7 +22,7 @@ def assert_multimodal_inputs_equal(expected: MultiModalInputs, def test_multimodal_input_batch_single_tensor(): t = torch.rand([1, 2]) - result = MultiModalInputs.batch([{"image": t}]) + result = MultiModalKwargs.batch([{"image": t}]) assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)}) @@ -30,7 +30,7 @@ def test_multimodal_input_batch_multiple_tensors(): a = torch.rand([1, 1, 2]) b = torch.rand([1, 1, 2]) c = torch.rand([1, 1, 2]) - result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) + result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])}) @@ -38,7 +38,7 @@ def test_multimodal_input_batch_multiple_heterogeneous_tensors(): a = torch.rand([1, 2, 2]) b = torch.rand([1, 3, 2]) c = torch.rand([1, 4, 2]) - result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) + result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) assert_multimodal_inputs_equal(result, {"image": [a, b, c]}) @@ -46,7 +46,7 @@ def test_multimodal_input_batch_nested_tensors(): a = torch.rand([2, 3]) b = torch.rand([2, 3]) c = torch.rand([2, 3]) - result = MultiModalInputs.batch([{ + result = MultiModalKwargs.batch([{ "image": [a] }, { "image": [b] @@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists(): a = torch.rand([1, 2, 3]) b = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) assert_multimodal_inputs_equal( result, {"image": [torch.stack([a, b]), c.unsqueeze(0)]}) @@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists(): b = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3]) d = torch.rand([1, 2, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c, d]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}]) assert_multimodal_inputs_equal( result, {"image": torch.stack([torch.stack([a, b]), @@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths(): b = torch.rand([1, 3, 3]) c = torch.rand([1, 4, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]}) - result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}]) + result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}]) assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]}) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index c3c9ec703c1e6..d0c5486ca5ac5 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -31,7 +31,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.base import MultiModalData from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv( raise pixel_values = raw_batch_data['images'] - return MultiModalInputs({'pixel_values': pixel_values}) + return MultiModalKwargs({'pixel_values': pixel_values}) def merge_glm_vision_embeddings( diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0de590d1d8372..1975d8a570f9a 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) @@ -219,7 +219,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): ]) # image has been processed with prompt in input processor - return MultiModalInputs({"pixel_values": data}) + return MultiModalKwargs({"pixel_values": data}) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 43242fe370ba2..767171dad7c7b 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -16,7 +16,7 @@ token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.utils import is_list_of @@ -324,12 +324,12 @@ def input_mapper( data: object, *, max_dynamic_patch: Optional[int] = None, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: # NOTE: Preprocessing for the image data is done in the # 'input_processor' function during actual inference. if isinstance(data, dict): - return MultiModalInputs(data) + return MultiModalKwargs(data) # The section below is only used with dummy data during # memory profiling. @@ -347,7 +347,7 @@ def input_mapper( pixel_values = [image_pixel_values_mapper(img) for img in data] else: - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, @@ -359,7 +359,7 @@ def input_mapper( return_tensors="pt", )[0] - return MultiModalInputs({ + return MultiModalKwargs({ "pixel_values": pixel_values, "image_token_id": image_token_id }) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index d2ec0ff6e74c6..dfa869c1f5531 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -26,7 +26,7 @@ InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -346,7 +346,7 @@ def input_mapper( # we can't stack here because images may have different num_patches data = [image_pixel_values_mapper(img) for img in data] else: - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, @@ -355,7 +355,7 @@ def input_mapper( add_special_tokens=False, return_tensors="pt")[0] - return MultiModalInputs({ + return MultiModalKwargs({ "pixel_values": data, "image_token_id": image_token_id }) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index f90df6b7df036..6ad04631f208e 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -53,7 +53,7 @@ from vllm.model_executor.models.utils import LLMWrapper from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData @@ -375,7 +375,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object): batch_data["slice_start_id"] = data[0]["slice_start_id"] batch_data["slice_end_id"] = data[0]["slice_end_id"] - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 251bfc079684e..f7d2889a6fa5b 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1163,7 +1163,7 @@ def sample( def _parse_and_validate_image_input(self, **kwargs: object): # tensor with the same shape will be batched together by - # MultiModalInputs.batch, so pixel_values here can be: + # MultiModalKwargs.batch, so pixel_values here can be: # - List[List[torch.Tensor]]: # with shape (num_tiles, 3, image_res, image_res) # - List[torch.Tensor]: diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ba798833e26a9..3aa68893de2d1 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) @@ -865,7 +865,7 @@ def image_input_mapper_for_molmo( ctx: InputContext, data: object, ): - return MultiModalInputs(data) + return MultiModalKwargs(data) def dummy_data_for_molmo(ctx: InputContext, seq_len: int, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 051454c49bff8..2e2683646ffdf 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -27,7 +27,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData @@ -91,8 +91,8 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, def input_mapper_for_pixtral(ctx: InputContext, - data: object) -> MultiModalInputs: - """Maps the input data to its MultiModalInputs (if any). + data: object) -> MultiModalKwargs: + """Maps the input data to its MultiModalKwargs (if any). Args: ctx: Context of the loaded model. @@ -100,7 +100,7 @@ def input_mapper_for_pixtral(ctx: InputContext, to pixel_values in .forward() for a visual QWenLMHeadModel model. Returns: - MultiModalInputs containing the stacked normalized images tensor or + MultiModalKwargs containing the stacked normalized images tensor or image embeddings. """ # Early exit if we have provided an image to a language only Qwen model @@ -118,7 +118,7 @@ def input_mapper_for_pixtral(ctx: InputContext, dtype=torch.float16) images.append(image) - return MultiModalInputs({"images": images}) + return MultiModalKwargs({"images": images}) def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b2b5c70182135..a53ddf12c8f5d 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -44,7 +44,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData from vllm.utils import is_list_of @@ -723,8 +723,8 @@ def input_processor_for_qwen(ctx: InputContext, multi_modal_data=multi_modal_data) -def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: - """Maps the input data to its MultiModalInputs (if any). +def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs: + """Maps the input data to its MultiModalKwargs (if any). Args: ctx: Context of the loaded model. @@ -732,7 +732,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: to pixel_values in .forward() for a visual QWenLMHeadModel model. Returns: - MultiModalInputs containing the stacked normalized images tensor or + MultiModalKwargs containing the stacked normalized images tensor or image embeddings. """ # Early exit if we have provided an image to a language only Qwen model @@ -741,7 +741,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: logger.warning( "Images were provided but this model has no visual config; " "multimodal inputs will not be forwarded to the model.") - return MultiModalInputs() + return MultiModalKwargs() model_config = ctx.model_config tokenizer = cached_get_tokenizer( @@ -785,7 +785,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: data = [data] transformed_images = [transform(datum) for datum in data] pixel_values = torch.stack(transformed_images, dim=0) - return MultiModalInputs({"pixel_values": pixel_values}) + return MultiModalKwargs({"pixel_values": pixel_values}) def build_normalization_transform(image_size: int) -> transforms.Compose: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 6114548bda42c..c6edcf0071118 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -43,7 +43,7 @@ default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData @@ -222,13 +222,13 @@ def input_processor_for_qwen2_audio( def input_mapper_for_qwen2_audio( ctx: InputContext, multi_modal_data: Union[np.ndarray, List[np.ndarray]], -) -> MultiModalInputs: +) -> MultiModalKwargs: """Input mapper for Qwen2-Audio.""" if not isinstance(multi_modal_data, list): multi_modal_data = [multi_modal_data] if len(multi_modal_data) == 0: - return MultiModalInputs() + return MultiModalKwargs() processor = cached_get_processor(ctx.model_config.model) audio_feature_extractor = processor.feature_extractor @@ -255,7 +255,7 @@ def input_mapper_for_qwen2_audio( logger.error("Failed to process audio (%s)", multi_modal_data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d801903f8f9fe..5ed57ce2f2f9d 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -58,7 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalInputs) + MultiModalKwargs) from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer @@ -567,10 +567,10 @@ def mm_input_mapper_for_qwen2_vl( *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, -) -> MultiModalInputs: +) -> MultiModalKwargs: """Input mapper for Qwen2-VL.""" if data_type_key == "image" and isinstance(data, dict): - return MultiModalInputs({ + return MultiModalKwargs({ "image_embeds": data.get("image_embeds"), "image_grid_thw": data.get("image_grid_thw"), }) @@ -608,7 +608,7 @@ def mm_input_mapper_for_qwen2_vl( logger.error("Failed to process image (%s)", data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 749750fc9c16e..13dfb21352529 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, @@ -116,11 +116,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): data = [data] if len(data) == 0: - return MultiModalInputs() + return MultiModalKwargs() # If the audio inputs are embeddings, no need for preprocessing if is_list_of(data, torch.Tensor, check="all"): - return MultiModalInputs({"audio_embeds": data}) + return MultiModalKwargs({"audio_embeds": data}) audio_features = [] for audio_input in data: @@ -154,7 +154,7 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): # Remove the batch dimension because we're wrapping it in a list. audio_features.append(single_audio_features.squeeze(0)) - return MultiModalInputs({"audio_features": audio_features}) + return MultiModalKwargs({"audio_features": audio_features}) def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 53da2badb9b98..f3cafb9efdc49 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,5 +1,5 @@ from .base import (BatchedTensorInputs, MultiModalDataBuiltins, - MultiModalDataDict, MultiModalInputs, + MultiModalDataDict, MultiModalKwargs, MultiModalPlaceholderDict, MultiModalPlaceholderMap, MultiModalPlugin, NestedTensors) from .registry import MultiModalRegistry @@ -17,7 +17,7 @@ "BatchedTensorInputs", "MultiModalDataBuiltins", "MultiModalDataDict", - "MultiModalInputs", + "MultiModalKwargs", "MultiModalPlaceholderDict", "MultiModalPlaceholderMap", "MultiModalPlugin", diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 04d71826f29fa..e71ae5feec1c6 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,5 +1,5 @@ from vllm.inputs.registry import InputContext -from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin +from vllm.multimodal.base import MultiModalKwargs, MultiModalPlugin class AudioPlugin(MultiModalPlugin): @@ -9,7 +9,7 @@ def get_data_key(self) -> str: return "audio" def _default_input_mapper(self, ctx: InputContext, data: object, - **mm_processor_kwargs) -> MultiModalInputs: + **mm_processor_kwargs) -> MultiModalKwargs: raise NotImplementedError("There is no default audio input mapper") def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 6b10d0c609f13..638a7dfa5021a 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -31,20 +31,20 @@ BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via -:meth:`MultiModalInputs.batch`. +:meth:`MultiModalKwargs.batch`. """ if sys.version_info < (3, 9): # UserDict cannot be subscripted - class _MultiModalInputsBase(UserDict): + class _MultiModalKwargsBase(UserDict): pass else: - class _MultiModalInputsBase(UserDict[str, NestedTensors]): + class _MultiModalKwargsBase(UserDict[str, NestedTensors]): pass -class MultiModalInputs(_MultiModalInputsBase): +class MultiModalKwargs(_MultiModalKwargsBase): """ A dictionary that represents the keyword arguments to :meth:`~torch.nn.Module.forward`. @@ -64,7 +64,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: if isinstance(nested_tensors, (int, float)): return torch.tensor(nested_tensors) - stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors] + stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] if not is_list_of(stacked, torch.Tensor, check="all"): # Only tensors (not lists) can be stacked. return stacked @@ -77,7 +77,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: return torch.stack(tensors_) @staticmethod - def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: + def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: """ Batch multiple inputs together into a dictionary. @@ -101,7 +101,7 @@ def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: item_lists[k].append(v) return { - k: MultiModalInputs._try_stack(item_list) + k: MultiModalKwargs._try_stack(item_list) for k, item_list in item_lists.items() } @@ -180,7 +180,7 @@ class PlaceholderRange(TypedDict): """ MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], - MultiModalInputs] + MultiModalKwargs] """ Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers @@ -229,7 +229,7 @@ def _default_input_mapper( ctx: InputContext, data: MultiModalData[object], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. This is similar in concept to @@ -273,7 +273,7 @@ def wrapper(model_cls: N) -> N: def map_input(self, model_config: "ModelConfig", data: MultiModalData[object], - mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs: + mm_processor_kwargs: Dict[str, Any]) -> MultiModalKwargs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 3f6bb6c8338d2..589b46266b08d 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -10,7 +10,7 @@ from vllm.transformers_utils.processor import get_image_processor from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalInputs, MultiModalPlugin +from .base import MultiModalData, MultiModalKwargs, MultiModalPlugin if TYPE_CHECKING: from vllm.config import ModelConfig @@ -43,12 +43,12 @@ def _default_input_mapper( ctx: InputContext, data: MultiModalData[object], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: model_config = ctx.model_config # Processed by input processor if isinstance(data, BatchFeature): - return MultiModalInputs(data.data) + return MultiModalKwargs(data.data) # PIL image if isinstance(data, Image.Image) or is_list_of(data, Image.Image): @@ -78,11 +78,11 @@ def _default_input_mapper( type(image_processor).__name__) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) # Image embedding elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor): - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) raise TypeError(f"Invalid image type: {type(data)}") diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index bce2f4c6abe5b..b844c9e1c2e89 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -5,7 +5,7 @@ from vllm.logger import init_logger from .audio import AudioPlugin -from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs, +from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalKwargs, MultiModalPlugin, MultiModalTokensCalc, NestedTensors) from .image import ImagePlugin from .video import VideoPlugin @@ -103,7 +103,7 @@ def map_input( model_config: "ModelConfig", data: MultiModalDataDict, mm_processor_kwargs: Optional[Dict[str, Any]] = None, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Apply an input mapper to the data passed to the model. @@ -139,7 +139,7 @@ def map_input( merged_dict[input_key] = input_tensor - return MultiModalInputs(merged_dict) + return MultiModalKwargs(merged_dict) def create_input_mapper(self, model_config: "ModelConfig"): """ diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 6c2c6720f4276..27487d9d5946e 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -8,7 +8,7 @@ from vllm.transformers_utils.processor import get_video_processor from vllm.transformers_utils.tokenizer import get_tokenizer -from .base import MultiModalData, MultiModalInputs +from .base import MultiModalData, MultiModalKwargs from .image import ImagePlugin if TYPE_CHECKING: @@ -54,7 +54,7 @@ def _default_input_mapper( ctx: InputContext, data: MultiModalData[object], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: model_config = ctx.model_config if isinstance(data, list) and len(data) == 1: @@ -78,7 +78,7 @@ def _default_input_mapper( logger.error("Failed to process video (%s)", data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) raise TypeError(f"Invalid video type: {type(data)}") diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 17cc0ad1a4a3a..13d65d8304d19 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -18,7 +18,7 @@ "CUDA and ROCm flash attention backend.") from err from vllm.logger import init_logger -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, ModelRunner) @@ -274,7 +274,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **kwargs, ) diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 8ebbf6db939bc..994af7c5a455f 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -5,7 +5,7 @@ from vllm.attention import AttentionMetadata from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad from vllm.worker.cpu_model_runner import (CPUModelRunner, @@ -287,7 +287,7 @@ def execute_model( kv_caches, "attn_metadata": model_input.attn_metadata, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), "intermediate_tensors": intermediate_tensors, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index fdd72a452f2ad..8826f756945a0 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap) + MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) from vllm.transformers_utils.config import uses_mrope @@ -201,7 +201,7 @@ def _prepare_prompt( slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -226,7 +226,7 @@ def _prepare_prompt( ._compute_multi_modal_input( seq_group_metadata, seq_data, computed_len, seq_group_metadata.mm_processor_kwargs) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( placeholder_map) @@ -298,7 +298,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -527,7 +527,7 @@ def execute_model( kv_caches, "attn_metadata": model_input.attn_metadata, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), "intermediate_tensors": intermediate_tensors, diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index ff288d5ca1512..37cfcbf13d7a3 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -8,7 +8,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.pooling_params import PoolingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, SequenceGroupMetadata) @@ -104,7 +104,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device)) if (self.observability_config is not None diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 90a43196084ea..008e0c9745994 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -18,7 +18,7 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.utils import get_architecture_class_name -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, @@ -206,7 +206,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **seqlen_agnostic_kwargs) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 328dab598f8ef..b6b9267e25f4d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -38,7 +38,7 @@ from vllm.model_executor.models import supports_lora, supports_multimodal from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap, + MultiModalKwargs, MultiModalPlaceholderMap, MultiModalRegistry) from vllm.platforms import current_platform from vllm.prompt_adapter.layers import PromptAdapterMapping @@ -240,7 +240,7 @@ def __init__( prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. - multi_modal_inputs: Optional[MultiModalInputs] = None, + multi_model_kwargs: Optional[MultiModalKwargs] = None, multi_modal_placeholder_maps: Optional[Dict[ str, MultiModalPlaceholderMap]] = None, @@ -361,7 +361,7 @@ def __init__( prompt_adapter_prompt_mapping or []) self.prompt_adapter_request = prompt_adapter_request - self.multi_modal_inputs = multi_modal_inputs + self.multi_model_kwargs = multi_model_kwargs self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit @@ -649,7 +649,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, mm_kwargs = self.multi_modal_input_mapper( mm_data, mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs) - inter_data.multi_modal_inputs = mm_kwargs + inter_data.multi_model_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. @@ -923,11 +923,11 @@ def build(self) -> ModelInputForGPU: ) # Multi-modal data. - multi_modal_inputs_list = [ - data.multi_modal_inputs for data in self.inter_data_list - if data.multi_modal_inputs is not None + multi_model_kwargs_list = [ + data.multi_model_kwargs for data in self.inter_data_list + if data.multi_model_kwargs is not None ] - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -1640,7 +1640,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **seqlen_agnostic_kwargs) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 2da22cbfc7cb5..0ed33e435aa2f 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuron import get_neuron_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalKwargs) from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase @@ -122,7 +122,7 @@ def _prepare_prompt( input_block_ids: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -149,7 +149,7 @@ def _prepare_prompt( mm_data, mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs, ) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) max_seq_len = max(seq_lens) assert max_seq_len > 0 @@ -167,7 +167,7 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return (input_tokens, input_positions, input_block_ids, seq_lens, multi_modal_kwargs) @@ -314,7 +314,7 @@ def execute_model( input_ids=model_input.input_tokens, positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), ) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index c9c87ea748081..378e1e06039b2 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.openvino import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap) + MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import SequenceGroupMetadata from vllm.worker.model_runner_base import ModelRunnerBase @@ -102,7 +102,7 @@ def _prepare_model_input( seq_lens: List[int] = [] past_lens: List[int] = [] query_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -226,7 +226,7 @@ def _prepare_model_input( mm_data, mm_processor_kwargs=seq_group_metadata. mm_processor_kwargs) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -275,7 +275,7 @@ def _prepare_model_input( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return ModelInput( input_tokens, @@ -341,7 +341,7 @@ def execute_model( kv_caches, "attn_metadata": attn_metadata, - **MultiModalInputs.as_kwargs(multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs or {}, device=self.device), } diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index bae8b469767b2..c9e637c057979 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -18,7 +18,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap, + MultiModalKwargs, MultiModalPlaceholderMap, MultiModalRegistry) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata @@ -160,7 +160,7 @@ def _prepare_prompt( input_positions: List[int] = [] slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -192,7 +192,7 @@ def _prepare_prompt( .from_seq_group(seq_group_metadata, positions_range) mm_kwargs = self.runner.multi_modal_input_mapper(mm_data) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -264,7 +264,7 @@ def _prepare_prompt( block_tables=torch.tensor([], device=self.device, dtype=torch.int), ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -565,7 +565,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device)) # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: From 8e9f9d0554b425b6ff550a4db8e6a30760f439c8 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 5 Nov 2024 14:59:16 +0000 Subject: [PATCH 2/3] Add deprecation warning Signed-off-by: DarkLight1337 --- vllm/multimodal/__init__.py | 15 +++++++++++++++ vllm/multimodal/base.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index f3cafb9efdc49..14911853abc73 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -25,3 +25,18 @@ "MULTIMODAL_REGISTRY", "MultiModalRegistry", ] + + +def __getattr__(name: str): + import warnings + + if name == "MultiModalInputs": + msg = ("MultiModalInputs has been renamed to MultiModalKwargs. " + "The original name will take another meaning in an upcoming " + "version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return MultiModalKwargs + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 638a7dfa5021a..4eea2418cf8e2 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -570,3 +570,18 @@ def index_map(self) -> "IndexMap": return MultiModalPlaceholderMap.IndexMap(src=src_indices, dest=dest_indices) + + +def __getattr__(name: str): + import warnings + + if name == "MultiModalInputs": + msg = ("MultiModalInputs has been renamed to MultiModalKwargs. " + "The original name will take another meaning in an upcoming " + "version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return MultiModalKwargs + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") From 6034d20352f3a64145adbf99f0cd2408d387e803 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 8 Nov 2024 06:16:34 +0000 Subject: [PATCH 3/3] Update Signed-off-by: DarkLight1337 --- vllm/model_executor/models/idefics3.py | 4 ++-- vllm/worker/hpu_model_runner.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index e4c98f22fb16f..db3771dc11e51 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -35,7 +35,7 @@ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.processor import cached_get_processor @@ -101,7 +101,7 @@ def input_mapper_for_idefics3( logger.error("Failed to process image (%s)", data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) def _resize_output_size(height: int, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 7e9b2bd13b48a..92d6552b2f428 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalKwargs) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) @@ -716,7 +716,7 @@ def _prepare_prompt( context_lens: List[int] = [] query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -777,7 +777,7 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: mm_kwargs = self.multi_modal_input_mapper(mm_data) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -876,7 +876,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps= None # FIXME(kzawora): mutli-modality will not work here ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return PreparePromptMetadata(input_tokens=input_tokens, input_positions=input_positions,