From 4c5d8e8ea91aa19415aa479d81e818913d51414c Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 11 Aug 2024 00:19:33 +0800 Subject: [PATCH] [Bugfix] Fix phi3v batch inference when images have different aspect ratio (#7392) --- tests/models/test_phi3v.py | 5 ++++- tests/tracing/test_tracing.py | 4 ++-- vllm/model_executor/models/phi3v.py | 26 ++++++++++++-------------- vllm/multimodal/utils.py | 9 +++++++-- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 35ffe4ef50a85..3737dc2bd076e 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -81,7 +81,10 @@ def run_test( inputs_per_image = [( [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], + [ + rescale_image_size(image, factor, transpose=idx) + for idx, factor in enumerate(size_factors) + ], ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] # NOTE: take care of the order. run vLLM first, and then run HF. diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index a492daf3b49ca..90f26400952b9 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -114,5 +114,5 @@ def test_traces(trace_service): SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft e2e_time = metrics.finished_time - metrics.arrival_time assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time - assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 823c34b101870..d39cf15a1bb96 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -189,7 +189,7 @@ def hd_feature_transform(self, image_features, image_sizes): global_image_features_hd_newline = self.add_image_newline( global_image_features_hd) - all_image_embeddings = [] + batch_image_features_proj = [] # need a for loop to process each image because of different image sizes # (patch arrangement is different for each image) for i, img_size in enumerate(image_sizes): @@ -207,19 +207,17 @@ def hd_feature_transform(self, image_features, image_sizes): sub_image_features_hd) # [sub features, separator, global features] - all_image_embeddings.append( - torch.cat([ - sub_image_features_hd_newline.squeeze( - 0), # (h_crop*12*(w_crop*12+1), 4096) - self.glb_GN.squeeze(0), - global_image_features_hd_newline[i], - ])) - - image_features_proj = self.img_projection( - torch.stack(all_image_embeddings).to(target_device, target_dtype) - ) # (num_images, (h_crop*12*(w_crop*12+1)+1), hidden_size) - - return image_features_proj + image_embeddings = torch.cat([ + sub_image_features_hd_newline.squeeze( + 0), # (h_crop*12*(w_crop*12+1), 4096) + self.glb_GN.squeeze(0), + global_image_features_hd_newline[i], + ]) + img_proj = self.img_projection( + image_embeddings.to(target_device, target_dtype)) + batch_image_features_proj.append(img_proj) + + return batch_image_features_proj def reshape_hd_patches_2x2merge(self, image_features, h_crop, w_crop): """ diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index bafd208469788..8f7e613cdf90a 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -90,8 +90,13 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: return _load_image_from_bytes(base64.b64decode(image)) -def rescale_image_size(image: Image.Image, size_factor: float) -> Image.Image: +def rescale_image_size(image: Image.Image, + size_factor: float, + transpose: int = -1) -> Image.Image: """Rescale the dimensions of an image by a constant factor.""" new_width = int(image.width * size_factor) new_height = int(image.height * size_factor) - return image.resize((new_width, new_height)) + image = image.resize((new_width, new_height)) + if transpose >= 0: + image = image.transpose(Image.Transpose(transpose)) + return image