diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 2c30ae98a5b38..1751873318523 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -706,7 +706,11 @@ def _cached_call_fine(
 
         for k, v in processed_modal_item.items():
             # Remove the extra batch dimension (if it exists)
-            processed_modal_items[k].append(v.squeeze(0))
+            # NOTE: v may be a list instead of a tensor
+            if len(v) == 1:
+                v = v[0]
+
+            processed_modal_items[k].append(v)
 
         for k, vs in processed_modal_items.items():
             # Try to merge elements into a single tensor
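
For context, a minimal standalone sketch (not part of the diff) of why indexing replaces `squeeze(0)` here: when a processor returns a length-1 list of tensors for a key, calling `v.squeeze(0)` raises an `AttributeError`, while `v[0]` unwraps both a length-1 list and a tensor with a leading batch dimension of 1. The dictionary keys and shapes below are hypothetical placeholders, not values from the PR.

```python
import torch

# Hypothetical processor output: one value is a batched tensor,
# the other is a length-1 list of tensors.
processed_modal_item = {
    "pixel_values": torch.rand(1, 3, 224, 224),
    "image_sizes": [torch.tensor([224, 224])],
}

processed_modal_items: dict[str, list] = {k: [] for k in processed_modal_item}

for k, v in processed_modal_item.items():
    # Remove the extra batch dimension (if it exists)
    # NOTE: v may be a list instead of a tensor, so index instead of squeeze
    if len(v) == 1:
        v = v[0]
    processed_modal_items[k].append(v)

for k, vs in processed_modal_items.items():
    print(k, [tuple(t.shape) for t in vs])
# pixel_values [(3, 224, 224)]
# image_sizes [(2,)]
```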