diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 573c3740f0bbb..93b3e3fe91663 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -82,20 +82,9 @@ steps: num_gpus: 2 commands: - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - - TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py - - TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s distributed/test_chunked_prefill_distributed.py + - pytest -v -s distributed/test_multimodal_broadcast.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py @@ -107,11 +96,6 @@ steps: fast_check: true commands: - pytest -v -s distributed/test_pynccl.py - # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. - # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - label: Pipeline Parallelism Test @@ -279,9 +263,6 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl - - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py - pytest -v -s -x lora/test_mixtral.py diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 7a0e5673b2cc4..1de2ebab22db4 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -1,15 +1,10 @@ """Compare the outputs of HF and distributed vLLM when using greedy sampling. -vLLM will allocate all the available memory, so we need to run the tests one -by one. The solution is to pass arguments (model name) by environment -variables. + Run: ```sh cd $VLLM_PATH/tests -TEST_DIST_MODEL=facebook/opt-125m pytest \ - distributed/test_basic_distributed_correctness.py -TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \ - distributed/test_basic_distributed_correctness.py +pytest distributed/test_basic_distributed_correctness.py ``` """ import os @@ -19,27 +14,48 @@ from vllm.utils import cuda_device_count_stateless from ..models.utils import check_outputs_equal +from ..utils import fork_new_process_for_each_test -MODELS = [ - os.environ["TEST_DIST_MODEL"], -] -DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND" +TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") @pytest.mark.skipif(cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [5]) +@pytest.mark.parametrize( + "model, distributed_executor_backend, attention_backend, test_suite", [ + ("facebook/opt-125m", "ray", "", "L4"), + ("facebook/opt-125m", "mp", "", "L4"), + ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), + ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"), + ("facebook/opt-125m", "ray", "", "A100"), + ("facebook/opt-125m", "mp", "", "A100"), + ("facebook/opt-125m", "mp", "FLASHINFER", "A100"), + ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), + ]) +@fork_new_process_for_each_test def test_models( hf_runner, vllm_runner, example_prompts, model: str, - dtype: str, - max_tokens: int, + distributed_executor_backend: str, + attention_backend: str, + test_suite: str, ) -> None: - distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND) + + if test_suite != TARGET_TEST_SUITE: + pytest.skip(f"Skip test for {test_suite}") + + if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa + # test ray adag + os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" + os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + + if attention_backend: + os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend + + dtype = "half" + max_tokens = 5 # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index 1ef085b933793..10921a3852f81 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -1,46 +1,39 @@ """Compare the outputs of HF and distributed vLLM when using greedy sampling. -vLLM will allocate all the available memory, so we need to run the tests one -by one. The solution is to pass arguments (model name) by environment -variables. Run: ```sh -TEST_DIST_MODEL=facebook/opt-125m pytest \ - test_chunked_prefill_distributed.py -TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \ - test_chunked_prefill_distributed.py +pytest test_chunked_prefill_distributed.py ``` """ -import os import pytest from vllm.utils import cuda_device_count_stateless from ..models.utils import check_outputs_equal - -MODELS = [ - os.environ["TEST_DIST_MODEL"], -] -DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND" +from ..utils import fork_new_process_for_each_test @pytest.mark.skipif(cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("chunked_prefill_token_size", [16]) +@pytest.mark.parametrize("model, distributed_executor_backend", [ + ("facebook/opt-125m", "ray"), + ("meta-llama/Llama-2-7b-hf", "ray"), + ("facebook/opt-125m", "mp"), + ("meta-llama/Llama-2-7b-hf", "mp"), +]) +@fork_new_process_for_each_test def test_models( hf_runner, vllm_runner, example_prompts, model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, + distributed_executor_backend: str, ) -> None: - distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND) + + dtype = "half" + max_tokens = 5 + chunked_prefill_token_size = 16 # Add a chunked prefill config. max_num_seqs = min(chunked_prefill_token_size, 256) diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py index a99917f586949..2c96358e2e6f2 100644 --- a/tests/distributed/test_multimodal_broadcast.py +++ b/tests/distributed/test_multimodal_broadcast.py @@ -1,44 +1,41 @@ """Compare the outputs of HF and distributed vLLM when using greedy sampling. -The second test will hang if more than one test is run per command, so we need -to run the tests one by one. The solution is to pass arguments (model name) by -environment variables. Run: ```sh -TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \ - test_multimodal_broadcast.py -TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct \ - test_multimodal_broadcast.py +pytest -s -v test_multimodal_broadcast.py ``` """ -import os import pytest from vllm.utils import cuda_device_count_stateless -model = os.environ["TEST_DIST_MODEL"] - -if model.startswith("llava-hf/llava-1.5"): - from ..models.test_llava import models, run_test -elif model.startswith("llava-hf/llava-v1.6"): - from ..models.test_llava_next import models, run_test -else: - raise NotImplementedError(f"Unsupported model: {model}") - - -@pytest.mark.parametrize("tensor_parallel_size", [2]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, - tensor_parallel_size: int, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - if cuda_device_count_stateless() < tensor_parallel_size: - pytest.skip( - f"Need at least {tensor_parallel_size} GPUs to run the test.") - - distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") +from ..utils import fork_new_process_for_each_test + + +@pytest.mark.skipif(cuda_device_count_stateless() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize("model, distributed_executor_backend", [ + ("llava-hf/llava-1.5-7b-hf", "ray"), + ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"), + ("llava-hf/llava-1.5-7b-hf", "mp"), + ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"), +]) +@fork_new_process_for_each_test +def test_models(hf_runner, vllm_runner, image_assets, model: str, + distributed_executor_backend: str) -> None: + + dtype = "half" + max_tokens = 5 + num_logprobs = 5 + tensor_parallel_size = 2 + + if model.startswith("llava-hf/llava-1.5"): + from ..models.test_llava import models, run_test + elif model.startswith("llava-hf/llava-v1.6"): + from ..models.test_llava_next import models, run_test + else: + raise NotImplementedError(f"Unsupported model: {model}") run_test( hf_runner,