From 629c3e79f663e79ca766963199cba4089b61836c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 6 Jan 2025 13:27:35 +0000 Subject: [PATCH] Fix LLaVA-NeXT feature size calculation (for real) Signed-off-by: DarkLight1337 --- .../processing/test_llava_next.py | 3 ++- .../processing/test_llava_onevision.py | 3 ++- vllm/model_executor/models/llava_next.py | 25 +++++++++---------- vllm/model_executor/models/llava_onevision.py | 25 +++++++++---------- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py index 6c8d300717de4..37a6d334ee60c 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py @@ -17,7 +17,8 @@ def processor_for_llava_next(): @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183), (198, 176), (176, 198)]) + (488, 183), (198, 176), (176, 198), + (161, 184), (184, 161)]) @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_prompt_replacements( processor_for_llava_next, diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py index 71adde6568a17..ed3e2db799be7 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py @@ -18,7 +18,8 @@ def processor_for_llava_onevision(): @pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183), (198, 176), (176, 198)]) + (488, 183), (198, 176), (176, 198), + (161, 184), (184, 161)]) @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_prompt_replacements( processor_for_llava_onevision, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index c76ec164a3087..e90226f100295 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -121,30 +121,29 @@ def _get_num_unpadded_features( num_patch_height: int, num_patch_width: int, ) -> tuple[int, int]: - current_height = npatches * num_patch_height - current_width = npatches * num_patch_width - # NOTE: Use float32 to remain consistent with HF output - original_aspect_ratio = np.array(original_width / original_height, - dtype=np.float32) - current_aspect_ratio = np.array(current_width / current_height, - dtype=np.float32) + current_height = np.float32(npatches * num_patch_height) + current_width = np.float32(npatches * num_patch_width) + + original_width = np.float32(original_width) # type: ignore + original_height = np.float32(original_height) # type: ignore + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height if original_aspect_ratio > current_aspect_ratio: - scale_factor = np.array(current_width / original_width, - dtype=np.float32) + scale_factor = current_width / original_width new_height = int(original_height * scale_factor) padding = (current_height - new_height) // 2 current_height -= 2 * padding else: - scale_factor = np.array(current_height / original_height, - dtype=np.float32) + scale_factor = current_height / original_height new_width = int(original_width * scale_factor) padding = (current_width - new_width) // 2 current_width -= 2 * padding - unpadded_features = current_height * current_width - newline_features = current_height + unpadded_features = int(current_height * current_width) + newline_features = int(current_height) return (unpadded_features, newline_features) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 6dccc1e0d3b8d..0440745332485 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -104,30 +104,29 @@ def _get_num_unpadded_features( num_patch_height: int, num_patch_width: int, ) -> tuple[int, int]: - current_height = npatches * num_patch_height - current_width = npatches * num_patch_width - # NOTE: Use float32 to remain consistent with HF output - original_aspect_ratio = np.array(original_width / original_height, - dtype=np.float32) - current_aspect_ratio = np.array(current_width / current_height, - dtype=np.float32) + current_height = np.float32(npatches * num_patch_height) + current_width = np.float32(npatches * num_patch_width) + + original_width = np.float32(original_width) # type: ignore + original_height = np.float32(original_height) # type: ignore + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height if original_aspect_ratio > current_aspect_ratio: - scale_factor = np.array(current_width / original_width, - dtype=np.float32) + scale_factor = current_width / original_width new_height = int(original_height * scale_factor) padding = (current_height - new_height) // 2 current_height -= 2 * padding else: - scale_factor = np.array(current_height / original_height, - dtype=np.float32) + scale_factor = current_height / original_height new_width = int(original_width * scale_factor) padding = (current_width - new_width) // 2 current_width -= 2 * padding - unpadded_features = current_height * current_width - newline_features = current_height + unpadded_features = int(current_height * current_width) + newline_features = int(current_height) ratio = math.sqrt(current_height * current_width / (9 * npatches**2)) if ratio > 1.1: