diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md
index 88af07afc7018..e4f2171e84ff7 100644
--- a/docs/source/design/multimodal/multimodal_index.md
+++ b/docs/source/design/multimodal/multimodal_index.md
@@ -45,39 +45,39 @@ adding_multimodal_plugin
 ### Base Classes
 
 ```{eval-rst}
-.. autodata:: vllm.multimodal.NestedTensors
+.. automodule:: vllm.multimodal.base
+    :members:
+    :show-inheritance:
 ```
 
-```{eval-rst}
-.. autodata:: vllm.multimodal.BatchedTensorInputs
-```
+### Input Classes
 
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
+.. automodule:: vllm.multimodal.inputs
     :members:
     :show-inheritance:
 ```
 
-```{eval-rst}
-.. autodata:: vllm.multimodal.MultiModalDataDict
-```
+### Audio Classes
 
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalKwargs
+.. automodule:: vllm.multimodal.audio
     :members:
     :show-inheritance:
 ```
 
+### Image Classes
+
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalPlugin
+.. automodule:: vllm.multimodal.image
     :members:
     :show-inheritance:
 ```
 
-### Image Classes
+### Video Classes
 
 ```{eval-rst}
-.. automodule:: vllm.multimodal.image
+.. automodule:: vllm.multimodal.video
     :members:
     :show-inheritance:
 ```
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 1e5a46946c6c0..d67ae3fc5fa7e 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -296,35 +296,37 @@ def from_seq_group(
         ``MultiModalPlaceholderMap`` that relates the multi-modal embedding
         vectors to their corresponding placeholders.
 
-        Consider the following scenarios:
+        Examples:
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |.................................|
+        .. code-block:: text
 
-            images      = [A, B]
-            src_ranges  = [(0, 4), (4, 8)]
-            dest_ranges = [(0, 4), (5, 9)]
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |.................................|
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |  .....                          |
+                images      = [A, B]
+                src_ranges  = [(0, 4), (4, 8)]
+                dest_ranges = [(0, 4), (5, 9)]
 
-            images      = [A, B]
-            src_ranges  = [(2, 4), (4, 6)]
-            dest_ranges = [(0, 2), (3, 5)]
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |  .....                          |
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |     .........                   |
+                images      = [A, B]
+                src_ranges  = [(2, 4), (4, 6)]
+                dest_ranges = [(0, 2), (3, 5)]
 
-            images      = [B]
-            src_ranges  = [(0, 4)]
-            dest_ranges = [(0, 4)]
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |     .........                   |
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |          .......................|
+                images      = [B]
+                src_ranges  = [(0, 4)]
+                dest_ranges = [(0, 4)]
 
-            images      = []
-            src_ranges  = []
-            dest_ranges = []
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |          .......................|
+
+                images      = []
+                src_ranges  = []
+                dest_ranges = []
         """
         seq_mm_data = seq_group.multi_modal_data
         seq_mm_placeholders = seq_group.multi_modal_placeholders
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 126b8bacbbe25..1fbda6e0b8750 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -204,9 +204,14 @@ class PlaceholderRange(TypedDict):
     """
     Placeholder location information for multi-modal data.
 
-    For example:
-        Prompt: AAAA BBBB What is in these images?
+    Example:
+
+        Prompt: :code:`AAAA BBBB What is in these images?`
+
         Images A and B will have:
+
+        .. code-block:: text
+
             A: { "offset": 0, "length": 4 }
             B: { "offset": 5, "length": 4 }
     """
@@ -376,31 +381,31 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
     A dictionary that represents the keyword arguments to
     :meth:`~torch.nn.Module.forward`.
 
-    The metadata `items_by_key` defines how to split batched keyword
+    The metadata :code:`items_by_key` defines how to split batched keyword
     arguments corresponding to each data item in :class:`MultiModalDataItems`:
 
-    - For a keyword argument, we can access the `i`th item in the batch via
-      `items_by_key[key][i]`.
+    - For a keyword argument, we can access the :code:`i`-th item in the batch
+      via :code:`items_by_key[key][i]`.
     - We can gather the keyword arguments belonging to a modality by finding
-      the keys with items that belong to that modality, then accessing the
-      `i`th item in the batch for each such key.
-    
+      the keys with items that belong to that modality, then accessing
+      the :code:`i`-th item in the batch for each such key.
+
     Example:
 
-        ```python
-        # All items belong to the "image" modality
-        items_by_key={
-            "pixel_values": [a, b, c, d],  # "image" modality
-            "image_grid_thw": [e, f, g, h],  # "image" modality
-            "pixel_values_video": [h, i, j],  # "video" modality
-            "video_grid_thw": [k, l, m],  # "video" modality
-        }
-        ```
+        .. code-block:: python
+
+            # Each key's items all belong to a single modality
+            items_by_key={
+                "pixel_values": [a, b, c, d],  # "image" modality
+                "image_grid_thw": [e, f, g, h],  # "image" modality
+                "pixel_values_video": [h, i, j],  # "video" modality
+                "video_grid_thw": [k, l, m],  # "video" modality
+            }
 
         - The keyword arguments belonging to the first image are
-          `{"pixel_values": a, "image_grid_thw": e}`.
+          :code:`{"pixel_values": a, "image_grid_thw": e}`.
         - The keyword arguments belonging to the second video are
-          `{"pixel_values_video": i, "video_grid_thw": l}`.
+          :code:`{"pixel_values_video": i, "video_grid_thw": l}`.
     """
 
     @staticmethod