diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 2c30ae98a5b38..1751873318523 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -706,7 +706,11 @@ def _cached_call_fine(
 
         for k, v in processed_modal_item.items():
             # Remove the extra batch dimension (if it exists)
-            processed_modal_items[k].append(v.squeeze(0))
+            # NOTE: v may be a list instead of a tensor
+            if len(v) == 1:
+                v = v[0]
+
+            processed_modal_items[k].append(v)
 
         for k, vs in processed_modal_items.items():
             # Try to merge elements into a single tensor
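
For context, a minimal standalone sketch (not part of the diff) of why indexing replaces `squeeze(0)` here: when a processor returns a length-1 list of tensors for a key, calling `v.squeeze(0)` raises an `AttributeError`, while `v[0]` unwraps both a length-1 list and a tensor with a leading batch dimension of 1. The dictionary keys and shapes below are hypothetical placeholders, not values from the PR.

```python
import torch

# Hypothetical processor output: one value is a batched tensor,
# the other is a length-1 list of tensors.
processed_modal_item = {
    "pixel_values": torch.rand(1, 3, 224, 224),
    "image_sizes": [torch.tensor([224, 224])],
}

processed_modal_items: dict[str, list] = {k: [] for k in processed_modal_item}

for k, v in processed_modal_item.items():
    # Remove the extra batch dimension (if it exists)
    # NOTE: v may be a list instead of a tensor, so index instead of squeeze
    if len(v) == 1:
        v = v[0]
    processed_modal_items[k].append(v)

for k, vs in processed_modal_items.items():
    print(k, [tuple(t.shape) for t in vs])
# pixel_values [(3, 224, 224)]
# image_sizes [(2,)]
```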