Skip to content

Commit

Permalink
[Bugfix] Fix Idefics3 fails during multi-image inference (vllm-project#11080)
Browse files Browse the repository at this point in the history

Signed-off-by: B-201 <[email protected]>
  • Loading branch information
B-201 authored Dec 11, 2024
1 parent 61b1d2f commit 2e32f5d
Showing 1 changed file with 13 additions and 8 deletions.
21 changes: 13 additions & 8 deletions vllm/model_executor/models/idefics3.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ class Idefics3ImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
"""
Shape: `(batch_size * num_images, num_channels, height, width)`
Shape: `(batch_size * num_images * num_patches,
num_channels, height, width)`
"""
pixel_attention_mask: Optional[torch.BoolTensor]

Expand Down Expand Up @@ -520,13 +521,17 @@ def _parse_and_validate_image_input(
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")

return Idefics3ImagePixelInputs(type="pixel_values",
data=self._validate_pixel_values(
flatten_bn(pixel_values,
concat=True)),
pixel_attention_mask=flatten_bn(
pixel_attention_mask,
concat=True))
if isinstance(pixel_values, list):
pixel_values = torch.cat(pixel_values, dim=1)
pixel_attention_mask = torch.cat(pixel_attention_mask, dim=1)
else:
pixel_values = flatten_bn(pixel_values)
pixel_attention_mask = flatten_bn(pixel_attention_mask)

return Idefics3ImagePixelInputs(
type="pixel_values",
data=self._validate_pixel_values(pixel_values),
pixel_attention_mask=pixel_attention_mask)

raise AssertionError("This line should be unreachable.")

Expand Down

0 comments on commit 2e32f5d

Please sign in to comment.