diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md index 88af07afc7018..e4f2171e84ff7 100644 --- a/docs/source/design/multimodal/multimodal_index.md +++ b/docs/source/design/multimodal/multimodal_index.md @@ -45,39 +45,39 @@ adding_multimodal_plugin ### Base Classes ```{eval-rst} -.. autodata:: vllm.multimodal.NestedTensors +.. automodule:: vllm.multimodal.base + :members: + :show-inheritance: ``` -```{eval-rst} -.. autodata:: vllm.multimodal.BatchedTensorInputs -``` +### Input Classes ```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalDataBuiltins +.. automodule:: vllm.multimodal.inputs :members: :show-inheritance: ``` -```{eval-rst} -.. autodata:: vllm.multimodal.MultiModalDataDict -``` +### Audio Classes ```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalKwargs +.. automodule:: vllm.multimodal.audio :members: :show-inheritance: ``` +### Image Classes + ```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalPlugin +.. automodule:: vllm.multimodal.image :members: :show-inheritance: ``` -### Image Classes +### Video Classes ```{eval-rst} -.. automodule:: vllm.multimodal.image +.. automodule:: vllm.multimodal.video :members: :show-inheritance: ``` diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 1e5a46946c6c0..d67ae3fc5fa7e 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -296,35 +296,37 @@ def from_seq_group( ``MultiModalPlaceholderMap`` that relates the multi-modal embedding vectors to their corresponding placeholders. - Consider the following scenarios: + Examples: - Prompt: |AAAA BBBB What's in these images?| - Positions: |.................................| + .. code-block:: - images = [A, B] - src_ranges = [(0, 4), (4, 8)] - dest_ranges = [(0, 4), (5, 9)] + Prompt: |AAAA BBBB What's in these images?| + Positions: |.................................| - Prompt: |AAAA BBBB What's in these images?| - Positions: | ..... 
| + images = [A, B] + src_ranges = [(0, 4), (4, 8)] + dest_ranges = [(0, 4), (5, 9)] - images = [A, B] - src_ranges = [(2, 4), (4, 6)] - dest_ranges = [(0, 2), (3, 5)] + Prompt: |AAAA BBBB What's in these images?| + Positions: | ..... | - Prompt: |AAAA BBBB What's in these images?| - Positions: | ......... | + images = [A, B] + src_ranges = [(2, 4), (4, 6)] + dest_ranges = [(0, 2), (3, 5)] - images = [B] - src_ranges = [(0, 4)] - dest_ranges = [(0, 4)] + Prompt: |AAAA BBBB What's in these images?| + Positions: | ......... | - Prompt: |AAAA BBBB What's in these images?| - Positions: | .......................| + images = [B] + src_ranges = [(0, 4)] + dest_ranges = [(0, 4)] - images = [] - src_ranges = [] - dest_ranges = [] + Prompt: |AAAA BBBB What's in these images?| + Positions: | .......................| + + images = [] + src_ranges = [] + dest_ranges = [] """ seq_mm_data = seq_group.multi_modal_data seq_mm_placeholders = seq_group.multi_modal_placeholders diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 126b8bacbbe25..1fbda6e0b8750 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -204,9 +204,14 @@ class PlaceholderRange(TypedDict): """ Placeholder location information for multi-modal data. - For example: - Prompt: AAAA BBBB What is in these images? + Example: + + Prompt: :code:`AAAA BBBB What is in these images?` + Images A and B will have: + + .. code-block:: + A: { "offset": 0, "length": 4 } B: { "offset": 5, "length": 4 } """ @@ -376,31 +381,31 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): A dictionary that represents the keyword arguments to :meth:`~torch.nn.Module.forward`. - The metadata `items_by_key` defines how to split batched keyword + The metadata :code:`items_by_key` defines how to split batched keyword arguments corresponding to each data item in :class:`MultiModalDataItems`: - - For a keyword argument, we can access the `i`th item in the batch via - `items_by_key[key][i]`. 
+ - For a keyword argument, we can access the :code:`i` th item in the batch + via :code:`items_by_key[key][i]`. - We can gather the keyword arguments belonging to a modality by finding - the keys with items that belong to that modality, then accessing the - `i`th item in the batch for each such key. - + the keys with items that belong to that modality, then accessing + the :code:`i` th item in the batch for each such key. + Example: - ```python - # All items belong to the "image" modality - items_by_key={ - "pixel_values": [a, b, c, d], # "image" modality - "image_grid_thw": [e, f, g, h], # "image" modality - "pixel_values_video": [h, i, j], # "video" modality - "video_grid_thw": [k, l, m], # "video" modality - } - ``` + .. code-block:: python + + # Each key belongs to a single modality ("image" or "video") + items_by_key={ + "pixel_values": [a, b, c, d], # "image" modality + "image_grid_thw": [e, f, g, h], # "image" modality + "pixel_values_video": [h, i, j], # "video" modality + "video_grid_thw": [k, l, m], # "video" modality + } - The keyword arguments belonging to the first image are - `{"pixel_values": a, "image_grid_thw": e}`. + :code:`{"pixel_values": a, "image_grid_thw": e}`. - The keyword arguments belonging to the second video are - `{"pixel_values_video": i, "video_grid_thw": l}`. + :code:`{"pixel_values_video": i, "video_grid_thw": l}`. """ @staticmethod