diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c6f8316412e2f..f3294c3c88192 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -242,7 +242,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
   parallelism: 4
 
 - label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -533,6 +533,7 @@ steps:
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_minicpmv_tp.py
 
 - label: Weight Loading Multiple GPU Test # 33min
diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py
deleted file mode 100644
index 8017c21b29da0..0000000000000
--- a/tests/lora/test_minicpmv.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from typing import List
-
-import pytest
-
-import vllm
-from vllm.assets.image import ImageAsset
-from vllm.lora.request import LoRARequest
-from vllm.platforms import current_platform
-
-MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
-
-PROMPT_TEMPLATE = (
-    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
-    "(<image>./</image>)\nWhat is in the image?<|eot_id|>"
-    "<|start_header_id|>assistant<|end_header_id|>\n\n")
-
-IMAGE_ASSETS = [
-    ImageAsset("stop_sign"),
-    ImageAsset("cherry_blossom"),
-]
-
-# After fine-tuning with LoRA, all generated content should start begin `A`.
-EXPECTED_OUTPUT = [
-    "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.",  # noqa: E501
-    "A pink cherry blossom tree with a blue sky in the background.",
-]
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
-    sampling_params = vllm.SamplingParams(
-        temperature=0,
-        max_tokens=5,
-        stop_token_ids=[128001, 128009],  # eos_id, eot_id
-    )
-
-    inputs = [{
-        "prompt": PROMPT_TEMPLATE,
-        "multi_modal_data": {
-            "image": asset.pil_image
-        },
-    } for asset in IMAGE_ASSETS]
-
-    outputs = llm.generate(
-        inputs,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None,
-    )
-    # Print the outputs.
-    generated_texts: List[str] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-@pytest.mark.xfail(
-    current_platform.is_rocm(),
-    reason="MiniCPM-V dependency xformers incompatible with ROCm")
-def test_minicpmv_lora(minicpmv_lora_files):
-    llm = vllm.LLM(
-        MODEL_PATH,
-        max_num_seqs=2,
-        enable_lora=True,
-        max_loras=2,
-        max_lora_rank=8,
-        enforce_eager=True,
-        trust_remote_code=True,
-        enable_chunked_prefill=True,
-    )
-    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
-    for i in range(len(EXPECTED_OUTPUT)):
-        assert EXPECTED_OUTPUT[i].startswith(output1[i])
-    output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
-    for i in range(len(EXPECTED_OUTPUT)):
-        assert EXPECTED_OUTPUT[i].startswith(output2[i])
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 05aad03d9d280..3b0f18325a40b 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -3,10 +3,10 @@
 import pytest
 
 import vllm
+from tests.utils import fork_new_process_for_each_test
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
-
-from ..utils import multi_gpu_test
+from vllm.platforms import current_platform
 
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
 
@@ -17,13 +17,11 @@
 
 IMAGE_ASSETS = [
     ImageAsset("stop_sign"),
-    ImageAsset("cherry_blossom"),
 ]
 
 # After fine-tuning with LoRA, all generated content should start begin `A`.
 EXPECTED_OUTPUT = [
     "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.",  # noqa: E501
-    "A pink cherry blossom tree with a blue sky in the background.",
 ]
 
 
@@ -50,37 +48,40 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     # Print the outputs.
     generated_texts: List[str] = []
     for output in outputs:
-        prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
         generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Generated text: {generated_text!r}")
     return generated_texts
 
 
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("fully_sharded", [True, False])
-def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded):
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_lora(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
-        enable_lora=True,
         max_num_seqs=2,
+        enable_lora=True,
         max_loras=2,
         max_lora_rank=8,
-        tensor_parallel_size=2,
+        enforce_eager=True,
         trust_remote_code=True,
-        fully_sharded_loras=fully_sharded,
         enable_chunked_prefill=True,
     )
-
-    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
-
+    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
     for i in range(len(EXPECTED_OUTPUT)):
-        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
+        assert EXPECTED_OUTPUT[i].startswith(output1[i])
+    output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output2[i])
 
 
-@multi_gpu_test(num_gpus=4)
-@pytest.mark.parametrize("fully_sharded", [True, False])
-def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded):
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
         enable_lora=True,
@@ -90,9 +91,32 @@ def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded):
         tensor_parallel_size=4,
         trust_remote_code=True,
         enforce_eager=True,
-        fully_sharded_loras=fully_sharded,
         enable_chunked_prefill=True,
     )
     output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
     for i in range(len(EXPECTED_OUTPUT)):
         assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
+
+
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        enable_lora=True,
+        max_num_seqs=2,
+        max_loras=2,
+        max_lora_rank=8,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=True,
+        enable_chunked_prefill=True,
+    )
+    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
+    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])