diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.rst index e112b43aade5e..30f543abc20c7 100644 --- a/docs/source/design/multimodal/multimodal_index.rst +++ b/docs/source/design/multimodal/multimodal_index.rst @@ -53,7 +53,7 @@ Base Classes .. autodata:: vllm.multimodal.MultiModalDataDict -.. autoclass:: vllm.multimodal.MultiModalInputs +.. autoclass:: vllm.multimodal.MultiModalKwargs :members: :show-inheritance: diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index 6ae8a6a704b0a..e6ed87fc8ea08 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -6,7 +6,7 @@ from PIL.Image import Image from vllm.inputs import InputContext, token_inputs -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import IMAGE_ASSETS @@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) # Ensure that we get the appropriately shaped pixel_values # for images and image embeddings, respectively. - assert isinstance(mapped_img_data, MultiModalInputs) + assert isinstance(mapped_img_data, MultiModalKwargs) assert "pixel_values" in mapped_img_data assert mapped_img_data["pixel_values"].shape == expected_shape diff --git a/tests/multimodal/test_base.py b/tests/multimodal/test_base.py index 68d05de904ba8..bfaf2cdeaa8d4 100644 --- a/tests/multimodal/test_base.py +++ b/tests/multimodal/test_base.py @@ -1,6 +1,6 @@ import torch -from vllm.multimodal.base import MultiModalInputs, NestedTensors +from vllm.multimodal.base import MultiModalKwargs, NestedTensors def assert_nested_tensors_equal(expected: NestedTensors, @@ -13,8 +13,8 @@ def assert_nested_tensors_equal(expected: NestedTensors, assert_nested_tensors_equal(expected_item, actual_item) -def assert_multimodal_inputs_equal(expected: MultiModalInputs, - actual: MultiModalInputs): +def assert_multimodal_inputs_equal(expected: MultiModalKwargs, + actual: MultiModalKwargs): assert set(expected.keys()) == set(actual.keys()) for key in expected: assert_nested_tensors_equal(expected[key], actual[key]) @@ -22,7 +22,7 @@ def assert_multimodal_inputs_equal(expected: MultiModalInputs, def test_multimodal_input_batch_single_tensor(): t = torch.rand([1, 2]) - result = MultiModalInputs.batch([{"image": t}]) + result = MultiModalKwargs.batch([{"image": t}]) assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)}) @@ -30,7 +30,7 @@ def test_multimodal_input_batch_multiple_tensors(): a = torch.rand([1, 1, 2]) b = torch.rand([1, 1, 2]) c = torch.rand([1, 1, 2]) - result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) + result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])}) @@ -38,7 +38,7 @@ def test_multimodal_input_batch_multiple_heterogeneous_tensors(): a = torch.rand([1, 2, 2]) b = torch.rand([1, 3, 2]) c = torch.rand([1, 4, 2]) - result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) + result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) assert_multimodal_inputs_equal(result, {"image": [a, b, c]}) @@ -46,7 +46,7 @@ def 
test_multimodal_input_batch_nested_tensors(): a = torch.rand([2, 3]) b = torch.rand([2, 3]) c = torch.rand([2, 3]) - result = MultiModalInputs.batch([{ + result = MultiModalKwargs.batch([{ "image": [a] }, { "image": [b] @@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists(): a = torch.rand([1, 2, 3]) b = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) assert_multimodal_inputs_equal( result, {"image": [torch.stack([a, b]), c.unsqueeze(0)]}) @@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists(): b = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3]) d = torch.rand([1, 2, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c, d]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}]) assert_multimodal_inputs_equal( result, {"image": torch.stack([torch.stack([a, b]), @@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths(): b = torch.rand([1, 3, 3]) c = torch.rand([1, 4, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]}) - result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}]) + result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}]) assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]}) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 032fa82ab93cd..eb9c3e3ae785d 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -30,7 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.base import MultiModalData from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv( raise pixel_values = raw_batch_data['images'] - return MultiModalInputs({'pixel_values': pixel_values}) + return MultiModalKwargs({'pixel_values': pixel_values}) def merge_glm_vision_embeddings( diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 3db82a898159b..653d5d60ea178 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -34,7 +34,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) @@ -218,7 +218,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): ]) # image has been processed with prompt in input processor - return MultiModalInputs({"pixel_values": data}) + return MultiModalKwargs({"pixel_values": data}) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) diff --git 
a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 43242fe370ba2..767171dad7c7b 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -16,7 +16,7 @@ token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.utils import is_list_of @@ -324,12 +324,12 @@ def input_mapper( data: object, *, max_dynamic_patch: Optional[int] = None, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: # NOTE: Preprocessing for the image data is done in the # 'input_processor' function during actual inference. if isinstance(data, dict): - return MultiModalInputs(data) + return MultiModalKwargs(data) # The section below is only used with dummy data during # memory profiling. @@ -347,7 +347,7 @@ def input_mapper( pixel_values = [image_pixel_values_mapper(img) for img in data] else: - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, @@ -359,7 +359,7 @@ def input_mapper( return_tensors="pt", )[0] - return MultiModalInputs({ + return MultiModalKwargs({ "pixel_values": pixel_values, "image_token_id": image_token_id }) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index e4c98f22fb16f..db3771dc11e51 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -35,7 +35,7 @@ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.processor import cached_get_processor @@ -101,7 +101,7 @@ def input_mapper_for_idefics3( logger.error("Failed to process image (%s)", data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) def _resize_output_size(height: int, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index bb9d38889a175..335b11d293acd 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -26,7 +26,7 @@ InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -346,7 +346,7 @@ def input_mapper( # we can't stack here because images may have different num_patches data = [image_pixel_values_mapper(img) for img in data] else: - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, @@ -355,7 +355,7 @@ def input_mapper( add_special_tokens=False, return_tensors="pt")[0] - return MultiModalInputs({ + 
return MultiModalKwargs({ "pixel_values": data, "image_token_id": image_token_id }) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 4ffe33bb6ce41..f8006095e2eb2 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -52,7 +52,7 @@ from vllm.model_executor.models.utils import LLMWrapper from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData @@ -374,7 +374,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object): batch_data["slice_start_id"] = data[0]["slice_start_id"] batch_data["slice_end_id"] = data[0]["slice_end_id"] - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index d442ffe3c1fb1..18e38daadc93a 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1162,7 +1162,7 @@ def sample( def _parse_and_validate_image_input(self, **kwargs: object): # tensor with the same shape will be batched together by - # MultiModalInputs.batch, so pixel_values here can be: + # MultiModalKwargs.batch, so pixel_values here can be: # - List[List[torch.Tensor]]: # with shape (num_tiles, 3, image_res, image_res) # - List[torch.Tensor]: diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 3a50923de3741..5f2f61cc610b3 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) @@ -866,7 +866,7 @@ def image_input_mapper_for_molmo( ctx: InputContext, data: object, ): - return MultiModalInputs(data) + return MultiModalKwargs(data) def dummy_data_for_molmo(ctx: InputContext, seq_len: int, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index facf1969b9479..de935fc420472 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -30,7 +30,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData @@ -94,8 +94,8 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, def input_mapper_for_pixtral(ctx: InputContext, - data: object) -> MultiModalInputs: - """Maps the input data to its MultiModalInputs (if any). 
+ data: object) -> MultiModalKwargs: + """Maps the input data to its MultiModalKwargs (if any). Args: ctx: Context of the loaded model. @@ -103,7 +103,7 @@ def input_mapper_for_pixtral(ctx: InputContext, to pixel_values in .forward() for a visual QWenLMHeadModel model. Returns: - MultiModalInputs containing the stacked normalized images tensor or + MultiModalKwargs containing the stacked normalized images tensor or image embeddings. """ # Early exit if we have provided an image to a language only Qwen model @@ -121,7 +121,7 @@ def input_mapper_for_pixtral(ctx: InputContext, dtype=torch.float16) images.append(image) - return MultiModalInputs({"images": images}) + return MultiModalKwargs({"images": images}) def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index c91c2caa3d519..1db7e2ba1cc12 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -43,7 +43,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData from vllm.utils import is_list_of @@ -722,8 +722,8 @@ def input_processor_for_qwen(ctx: InputContext, multi_modal_data=multi_modal_data) -def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: - """Maps the input data to its MultiModalInputs (if any). +def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs: + """Maps the input data to its MultiModalKwargs (if any). Args: ctx: Context of the loaded model. @@ -731,7 +731,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: to pixel_values in .forward() for a visual QWenLMHeadModel model. Returns: - MultiModalInputs containing the stacked normalized images tensor or + MultiModalKwargs containing the stacked normalized images tensor or image embeddings. 
""" # Early exit if we have provided an image to a language only Qwen model @@ -740,7 +740,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: logger.warning( "Images were provided but this model has no visual config; " "multimodal inputs will not be forwarded to the model.") - return MultiModalInputs() + return MultiModalKwargs() model_config = ctx.model_config tokenizer = cached_get_tokenizer( @@ -784,7 +784,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: data = [data] transformed_images = [transform(datum) for datum in data] pixel_values = torch.stack(transformed_images, dim=0) - return MultiModalInputs({"pixel_values": pixel_values}) + return MultiModalKwargs({"pixel_values": pixel_values}) def build_normalization_transform(image_size: int) -> transforms.Compose: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 54a7085f69ba9..18cf45b3939f7 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -42,7 +42,7 @@ default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData @@ -221,13 +221,13 @@ def input_processor_for_qwen2_audio( def input_mapper_for_qwen2_audio( ctx: InputContext, multi_modal_data: Union[np.ndarray, List[np.ndarray]], -) -> MultiModalInputs: +) -> MultiModalKwargs: """Input mapper for Qwen2-Audio.""" if not isinstance(multi_modal_data, list): multi_modal_data = [multi_modal_data] if len(multi_modal_data) == 0: - return MultiModalInputs() + return MultiModalKwargs() processor = cached_get_processor(ctx.model_config.model) audio_feature_extractor = processor.feature_extractor @@ -254,7 +254,7 @@ def input_mapper_for_qwen2_audio( logger.error("Failed to process audio (%s)", multi_modal_data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0e820cf123139..8073c5f4b2fd2 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -57,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalInputs) + MultiModalKwargs) from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer @@ -576,10 +576,10 @@ def mm_input_mapper_for_qwen2_vl( *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, -) -> MultiModalInputs: +) -> MultiModalKwargs: """Input mapper for Qwen2-VL.""" if data_type_key == "image" and isinstance(data, dict): - return MultiModalInputs({ + return MultiModalKwargs({ "image_embeds": data.get("image_embeds"), "image_grid_thw": data.get("image_grid_thw"), }) @@ -613,7 +613,7 @@ def mm_input_mapper_for_qwen2_vl( logger.error("Failed to process image (%s)", data) raise - return MultiModalInputs(batch_data) + return 
MultiModalKwargs(batch_data) image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 3a343986a9345..1b870f34deed2 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, @@ -116,11 +116,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): data = [data] if len(data) == 0: - return MultiModalInputs() + return MultiModalKwargs() # If the audio inputs are embeddings, no need for preprocessing if is_list_of(data, torch.Tensor, check="all"): - return MultiModalInputs({"audio_embeds": data}) + return MultiModalKwargs({"audio_embeds": data}) audio_features = [] for audio_input in data: @@ -154,7 +154,7 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): # Remove the batch dimension because we're wrapping it in a list. audio_features.append(single_audio_features.squeeze(0)) - return MultiModalInputs({"audio_features": audio_features}) + return MultiModalKwargs({"audio_features": audio_features}) def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 53da2badb9b98..14911853abc73 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,5 +1,5 @@ from .base import (BatchedTensorInputs, MultiModalDataBuiltins, - MultiModalDataDict, MultiModalInputs, + MultiModalDataDict, MultiModalKwargs, MultiModalPlaceholderDict, MultiModalPlaceholderMap, MultiModalPlugin, NestedTensors) from .registry import MultiModalRegistry @@ -17,7 +17,7 @@ "BatchedTensorInputs", "MultiModalDataBuiltins", "MultiModalDataDict", - "MultiModalInputs", + "MultiModalKwargs", "MultiModalPlaceholderDict", "MultiModalPlaceholderMap", "MultiModalPlugin", @@ -25,3 +25,18 @@ "MULTIMODAL_REGISTRY", "MultiModalRegistry", ] + + +def __getattr__(name: str): + import warnings + + if name == "MultiModalInputs": + msg = ("MultiModalInputs has been renamed to MultiModalKwargs. 
" + "The original name will take another meaning in an upcoming " + "version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return MultiModalKwargs + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 04d71826f29fa..e71ae5feec1c6 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,5 +1,5 @@ from vllm.inputs.registry import InputContext -from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin +from vllm.multimodal.base import MultiModalKwargs, MultiModalPlugin class AudioPlugin(MultiModalPlugin): @@ -9,7 +9,7 @@ def get_data_key(self) -> str: return "audio" def _default_input_mapper(self, ctx: InputContext, data: object, - **mm_processor_kwargs) -> MultiModalInputs: + **mm_processor_kwargs) -> MultiModalKwargs: raise NotImplementedError("There is no default audio input mapper") def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 26c94cf2d0b20..fa514d3fcb3b7 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -30,15 +30,15 @@ BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via -:meth:`MultiModalInputs.batch`. +:meth:`MultiModalKwargs.batch`. """ -class _MultiModalInputsBase(UserDict[str, NestedTensors]): +class _MultiModalKwargsBase(UserDict[str, NestedTensors]): pass -class MultiModalInputs(_MultiModalInputsBase): +class MultiModalKwargs(_MultiModalKwargsBase): """ A dictionary that represents the keyword arguments to :meth:`~torch.nn.Module.forward`. @@ -58,7 +58,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: if isinstance(nested_tensors, (int, float)): return torch.tensor(nested_tensors) - stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors] + stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] if not is_list_of(stacked, torch.Tensor, check="all"): # Only tensors (not lists) can be stacked. return stacked @@ -71,7 +71,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: return torch.stack(tensors_) @staticmethod - def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: + def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: """ Batch multiple inputs together into a dictionary. @@ -95,7 +95,7 @@ def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: item_lists[k].append(v) return { - k: MultiModalInputs._try_stack(item_list) + k: MultiModalKwargs._try_stack(item_list) for k, item_list in item_lists.items() } @@ -177,7 +177,7 @@ class PlaceholderRange(TypedDict): """ MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], - MultiModalInputs] + MultiModalKwargs] """ Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers @@ -226,7 +226,7 @@ def _default_input_mapper( ctx: InputContext, data: MultiModalData[object], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. 
This is similar in concept to @@ -275,7 +275,7 @@ def map_input( model_config: "ModelConfig", data: MultiModalData[object], mm_processor_kwargs: Dict[str, Any], - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. @@ -585,3 +585,18 @@ def index_map(self) -> "IndexMap": return MultiModalPlaceholderMap.IndexMap(src=src_indices, dest=dest_indices) + + +def __getattr__(name: str): + import warnings + + if name == "MultiModalInputs": + msg = ("MultiModalInputs has been renamed to MultiModalKwargs. " + "The original name will take another meaning in an upcoming " + "version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return MultiModalKwargs + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 3f6bb6c8338d2..589b46266b08d 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -10,7 +10,7 @@ from vllm.transformers_utils.processor import get_image_processor from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalInputs, MultiModalPlugin +from .base import MultiModalData, MultiModalKwargs, MultiModalPlugin if TYPE_CHECKING: from vllm.config import ModelConfig @@ -43,12 +43,12 @@ def _default_input_mapper( ctx: InputContext, data: MultiModalData[object], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: model_config = ctx.model_config # Processed by input processor if isinstance(data, BatchFeature): - return MultiModalInputs(data.data) + return MultiModalKwargs(data.data) # PIL image if isinstance(data, Image.Image) or is_list_of(data, Image.Image): @@ -78,11 +78,11 @@ def _default_input_mapper( type(image_processor).__name__) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) # Image embedding elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor): - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) raise TypeError(f"Invalid image type: {type(data)}") diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index bce2f4c6abe5b..b844c9e1c2e89 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -5,7 +5,7 @@ from vllm.logger import init_logger from .audio import AudioPlugin -from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs, +from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalKwargs, MultiModalPlugin, MultiModalTokensCalc, NestedTensors) from .image import ImagePlugin from .video import VideoPlugin @@ -103,7 +103,7 @@ def map_input( model_config: "ModelConfig", data: MultiModalDataDict, mm_processor_kwargs: Optional[Dict[str, Any]] = None, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Apply an input mapper to the data passed to the model. 
@@ -139,7 +139,7 @@ def map_input( merged_dict[input_key] = input_tensor - return MultiModalInputs(merged_dict) + return MultiModalKwargs(merged_dict) def create_input_mapper(self, model_config: "ModelConfig"): """ diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 40a92fed28c87..a518270974f92 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -9,7 +9,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalInputs +from .base import MultiModalData, MultiModalKwargs from .image import ImagePlugin if TYPE_CHECKING: @@ -55,7 +55,7 @@ def _default_input_mapper( ctx: InputContext, data: MultiModalData[object], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: model_config = ctx.model_config if isinstance(data, list) and len(data) == 1: @@ -79,7 +79,7 @@ def _default_input_mapper( logger.error("Failed to process video (%s)", data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) raise TypeError(f"Invalid video type: {type(data)}") diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 6330ac027db74..cd4d7eb0e6e4e 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -18,7 +18,7 @@ "CUDA and ROCm flash attention backend.") from err from vllm.logger import init_logger -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, ModelRunner) @@ -280,7 +280,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **kwargs, ) diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 8ebbf6db939bc..994af7c5a455f 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -5,7 +5,7 @@ from vllm.attention import AttentionMetadata from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad from vllm.worker.cpu_model_runner import (CPUModelRunner, @@ -287,7 +287,7 @@ def execute_model( kv_caches, "attn_metadata": model_input.attn_metadata, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), "intermediate_tensors": intermediate_tensors, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 26a15ed645c43..1590184d6f831 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap) + MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) from vllm.utils import make_tensor_with_pad @@ -200,7 +200,7 @@ def 
_prepare_prompt( slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -225,7 +225,7 @@ def _prepare_prompt( ._compute_multi_modal_input( seq_group_metadata, seq_data, computed_len, seq_group_metadata.mm_processor_kwargs) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( placeholder_map) @@ -297,7 +297,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -520,7 +520,7 @@ def execute_model( kv_caches, "attn_metadata": model_input.attn_metadata, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), "intermediate_tensors": intermediate_tensors, diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index ff288d5ca1512..37cfcbf13d7a3 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -8,7 +8,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.pooling_params import PoolingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, SequenceGroupMetadata) @@ -104,7 +104,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device)) if (self.observability_config is not None diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 90a43196084ea..008e0c9745994 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -18,7 +18,7 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.utils import get_architecture_class_name -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, @@ -206,7 +206,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **seqlen_agnostic_kwargs) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 7e9b2bd13b48a..92d6552b2f428 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal 
import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalKwargs) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) @@ -716,7 +716,7 @@ def _prepare_prompt( context_lens: List[int] = [] query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -777,7 +777,7 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: mm_kwargs = self.multi_modal_input_mapper(mm_data) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -876,7 +876,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps= None # FIXME(kzawora): mutli-modality will not work here ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return PreparePromptMetadata(input_tokens=input_tokens, input_positions=input_positions, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a1ec2e85be7b8..e1446192ce3d6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -38,7 +38,7 @@ from vllm.model_executor.models import supports_lora, supports_multimodal from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap, + MultiModalKwargs, MultiModalPlaceholderMap, MultiModalRegistry) from vllm.platforms import current_platform from vllm.prompt_adapter.layers import PromptAdapterMapping @@ -252,7 +252,7 @@ def __init__( prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. - multi_modal_inputs: Optional[MultiModalInputs] = None, + multi_model_kwargs: Optional[MultiModalKwargs] = None, multi_modal_placeholder_maps: Optional[Dict[ str, MultiModalPlaceholderMap]] = None, @@ -373,7 +373,7 @@ def __init__( prompt_adapter_prompt_mapping or []) self.prompt_adapter_request = prompt_adapter_request - self.multi_modal_inputs = multi_modal_inputs + self.multi_model_kwargs = multi_model_kwargs self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit @@ -661,7 +661,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, mm_kwargs = self.multi_modal_input_mapper( mm_data, mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs) - inter_data.multi_modal_inputs = mm_kwargs + inter_data.multi_model_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. @@ -935,11 +935,11 @@ def build(self) -> ModelInputForGPU: ) # Multi-modal data. 
- multi_modal_inputs_list = [ - data.multi_modal_inputs for data in self.inter_data_list - if data.multi_modal_inputs is not None + multi_model_kwargs_list = [ + data.multi_model_kwargs for data in self.inter_data_list + if data.multi_model_kwargs is not None ] - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -1649,7 +1649,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **seqlen_agnostic_kwargs) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 2da22cbfc7cb5..0ed33e435aa2f 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuron import get_neuron_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalKwargs) from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase @@ -122,7 +122,7 @@ def _prepare_prompt( input_block_ids: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -149,7 +149,7 @@ def _prepare_prompt( mm_data, mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs, ) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) max_seq_len = max(seq_lens) assert max_seq_len > 0 @@ -167,7 +167,7 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return (input_tokens, input_positions, input_block_ids, seq_lens, multi_modal_kwargs) @@ -314,7 +314,7 @@ def execute_model( input_ids=model_input.input_tokens, positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), ) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index c9c87ea748081..378e1e06039b2 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.openvino import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap) + MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import SequenceGroupMetadata from vllm.worker.model_runner_base import ModelRunnerBase @@ -102,7 +102,7 @@ def _prepare_model_input( seq_lens: List[int] = [] past_lens: List[int] = [] query_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] 
multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -226,7 +226,7 @@ def _prepare_model_input( mm_data, mm_processor_kwargs=seq_group_metadata. mm_processor_kwargs) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -275,7 +275,7 @@ def _prepare_model_input( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return ModelInput( input_tokens, @@ -341,7 +341,7 @@ def execute_model( kv_caches, "attn_metadata": attn_metadata, - **MultiModalInputs.as_kwargs(multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs or {}, device=self.device), } diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index bae8b469767b2..c9e637c057979 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -18,7 +18,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap, + MultiModalKwargs, MultiModalPlaceholderMap, MultiModalRegistry) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata @@ -160,7 +160,7 @@ def _prepare_prompt( input_positions: List[int] = [] slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -192,7 +192,7 @@ def _prepare_prompt( .from_seq_group(seq_group_metadata, positions_range) mm_kwargs = self.runner.multi_modal_input_mapper(mm_data) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -264,7 +264,7 @@ def _prepare_prompt( block_tables=torch.tensor([], device=self.device, dtype=torch.int), ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -565,7 +565,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device)) # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank:
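The module-level `__getattr__` hooks added to `vllm/multimodal/__init__.py` and `vllm/multimodal/base.py` keep the old name importable while emitting a `DeprecationWarning`. A minimal sketch of the expected behaviour for downstream code, assuming a build that includes this patch:

```python
import warnings

from vllm.multimodal import MultiModalKwargs

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # The old name is resolved through the new module-level __getattr__ shim
    # rather than a real attribute, so a DeprecationWarning is emitted.
    from vllm.multimodal import MultiModalInputs

# The shim returns the renamed class itself, so old call sites keep working.
assert MultiModalInputs is MultiModalKwargs
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```

The same shim in `vllm.multimodal.base` covers imports of the old name from that module as well.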
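The tests in `tests/multimodal/test_base.py` above exercise the batching semantics under the new name: `MultiModalKwargs.batch` stacks same-shaped entries into a single tensor and keeps ragged entries as lists. A short sketch of that behaviour from user code, using arbitrary example shapes (the real input mappers also receive an `InputContext`, omitted here):

```python
import torch

from vllm.multimodal.base import MultiModalKwargs

# Two per-request inputs with identically shaped "pixel_values".
item_a = MultiModalKwargs({"pixel_values": torch.rand(1, 3, 4, 4)})
item_b = MultiModalKwargs({"pixel_values": torch.rand(1, 3, 4, 4)})

# Same-shaped values are stacked along a new batch dimension...
batched = MultiModalKwargs.batch([item_a, item_b])
assert batched["pixel_values"].shape == (2, 1, 3, 4, 4)

# ...while ragged values are kept as a list instead of being stacked.
ragged = MultiModalKwargs.batch([
    MultiModalKwargs({"pixel_values": torch.rand(1, 2, 4)}),
    MultiModalKwargs({"pixel_values": torch.rand(1, 3, 4)}),
])
assert isinstance(ragged["pixel_values"], list)
```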