From 32e49402f7142e02ffa551a8fc376e4a84305ebe Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 8 Nov 2024 23:30:04 +0800
Subject: [PATCH] [CI/Build] Update CPU tests to include all "standard" tests (#5481)

Signed-off-by: DarkLight1337
Signed-off-by: Loc Huynh
---
 .buildkite/run-cpu-test-ppc64le.sh      | 21 ++++++++++------
 .buildkite/run-cpu-test.sh              | 25 +++++++++++++------
 .buildkite/test-pipeline.yaml           |  3 +--
 pyproject.toml                          |  3 ++-
 requirements-test.in                    |  5 ----
 .../audio_language/test_ultravox.py     | 17 ++++++++++---
 .../vision_language/test_h2ovl.py       |  1 -
 .../vision_language/test_models.py      | 11 +++-----
 .../vision_language/test_phi3v.py       |  2 --
 tests/models/utils.py                   |  3 +--
 vllm/assets/image.py                    |  2 +-
 vllm/model_executor/models/ultravox.py  |  4 +--
 vllm/multimodal/utils.py                |  8 +++---
 vllm/worker/cpu_worker.py               |  6 ++++-
 14 files changed, 63 insertions(+), 48 deletions(-)

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index a63c95e51002f..5add7ff0c15c9 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 
 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" \
-    --ignore=tests/models/test_embedding.py \
-    --ignore=tests/models/test_oot_registration.py \
-    --ignore=tests/models/test_registry.py \
-    --ignore=tests/models/test_jamba.py \
-    --ignore=tests/models/test_mamba.py \
-    --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
+  set -e
+  pip install pytest pytest-asyncio \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
+  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+  # Embedding models are not supported for CPU yet
+  # pytest -v -s tests/models/embedding/language
+  pytest -v -s tests/models/encoder_decoder/language
+  pytest -v -s tests/models/decoder_only/language/test_models.py
+  # Chunked prefill not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
 # online inference
 docker exec cpu-test bash -c "
+  set -e
   python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
   timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
   python3 benchmarks/benchmark_serving.py \
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 064d7c77ab570..25a448e63be27 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
   --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
 
 # offline inference
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
+docker exec cpu-test-avx2 bash -c "
+  set -e
+  python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  set -e
+  pip install pytest pytest-asyncio \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
+  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+  # Embedding models are not supported for CPU yet
+  # pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
-  pytest -v -s tests/models/decoder_only/language \
-    --ignore=tests/models/test_fp8.py \
-    --ignore=tests/models/decoder_only/language/test_jamba.py \
-    --ignore=tests/models/decoder_only/language/test_mamba.py \
-    --ignore=tests/models/decoder_only/language/test_granitemoe.py \
-    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pytest -v -s tests/models/decoder_only/language/test_models.py
+  # Chunked prefill not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
 # Run compressed-tensor test
 docker exec cpu-test bash -c "
+  set -e
   pytest -s -v \
   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
 # Run AWQ test
 docker exec cpu-test bash -c "
+  set -e
   pytest -s -v \
   tests/quantization/test_ipex_quant.py"
 
 # online inference
 docker exec cpu-test bash -c "
+  set -e
   export VLLM_CPU_KVCACHE_SPACE=10
   export VLLM_CPU_OMP_THREADS_BIND=48-92
   python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 705e81d15ad65..2c5d74e7abcbf 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -269,7 +269,6 @@ steps:
   source_file_dependencies:
   - benchmarks/
   commands:
-  - pip install aiohttp
   - bash run-benchmarks.sh
 
 - label: Quantization Test # 33min
@@ -331,7 +330,7 @@ steps:
   commands:
     - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
 
-- label: Decoder-only Multi-Modal Models Test (Standard)
+- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
diff --git a/pyproject.toml b/pyproject.toml
index bae8645502dea..1385a15d07878 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -93,7 +93,8 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
     "skip_global_cleanup",
-    "core_model: run this model test in each PR instead of just daily",
+    "core_model: enable this model test in each PR instead of only nightly",
+    "cpu_model: enable this model test in CPU tests",
     "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
     "skip_v1: do not run this test with v1",
 ]
diff --git a/requirements-test.in b/requirements-test.in
index 1b4b9ba78ed9c..76f6de2f77c34 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -12,9 +12,7 @@ decord # required for video tests
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
-opencv-python # required for video tests
 peft
-requests
 ray[adag]==2.35
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
@@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
 # TODO: Add this after fully implementing llava(mantis)
 # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
 
-# Benchmarking
-aiohttp
-
 # quantization
 bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.9
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index d14e88b4e5b26..e100c6b9bb906 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -5,11 +5,11 @@
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer, BatchEncoding
 
-from tests.utils import RemoteOpenAIServer
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
 from ....conftest import HfRunner, VllmRunner
+from ....utils import RemoteOpenAIServer
 from ...utils import check_logprobs_close
 
 MODEL_NAME = "fixie-ai/ultravox-v0_3"
@@ -39,7 +39,10 @@ def audio(request):
     return AudioAsset(request.param)
 
 
-@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
+@pytest.fixture(params=[
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def server(request, audio_assets):
     args = [
         "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
@@ -185,7 +188,10 @@ def run_multi_audio_test(
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
                 num_logprobs: int, vllm_kwargs: dict) -> None:
 
@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
                                      max_tokens: int, num_logprobs: int,
                                      vllm_kwargs: dict) -> None:
diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
index ad9aa3104750b..45a7365204403 100644
--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -14,7 +14,6 @@
     "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
     "h2oai/h2ovl-mississippi-2b",
 ]
-target_dtype = "bfloat16"
 
 
 def run_preprocessing_test(
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 3dbfaafb781af..163752e9fe06e 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -94,7 +94,7 @@
             ),
             limit_mm_per_prompt={"image": 4},
         )],
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "paligemma": VLMTestInfo(
         models=["google/paligemma-3b-mix-224"],
@@ -111,7 +111,8 @@
             "pixel_values"
         ),
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        dtype="half" if current_platform.is_rocm() else ("half", "float"),
+        dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
+               else ("half", "float")),
         marks=[pytest.mark.core_model],
     ),
     "qwen2_vl": VLMTestInfo(
@@ -128,7 +129,7 @@
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
     #### Extended model tests
@@ -172,7 +173,6 @@
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
     "glm4": VLMTestInfo(
@@ -245,7 +245,6 @@
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
         test_type=VLMTestType.CUSTOM_INPUTS,
         prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-        dtype="half",
         num_video_frames=16,
         max_model_len=16384,
         postprocess_inputs=model_utils.get_key_type_post_processor(
@@ -404,7 +403,6 @@
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=4096,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
         custom_test_opts=[
@@ -419,7 +417,6 @@
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=16384,
         max_num_seqs=2,
-        dtype="half",
         postprocess_inputs=model_utils.get_key_type_post_processor(
             "pixel_values"
         ),
diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py
index b9c20ddb2d746..82eae0705c9ba 100644
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
 
 
 target_dtype = "half"
-if current_platform.is_cpu():
-    target_dtype = "bfloat16"
 
 # ROCm Triton FA can run into shared memory issues with these models,
 # use other backends in the meantime
diff --git a/tests/models/utils.py b/tests/models/utils.py
index f7802d98ad678..0eb3f61f1f047 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -5,7 +5,6 @@
 
 from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
-from vllm.platforms import current_platform
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
 
 TokensText = Tuple[List[int], str]
@@ -270,7 +269,7 @@ def build_model_context(model_name: str,
     if tokenizer_name is None:
         tokenizer_name = model_name
     if dtype is None:
-        dtype = "bfloat16" if current_platform.is_cpu() else "half"
+        dtype = "half"
 
     model_config = ModelConfig(
         model_name,
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index 5eec78c328903..389ecd5c869bc 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -27,4 +27,4 @@ def image_embeds(self) -> torch.Tensor:
         """
         image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
                                             s3_prefix=VLM_IMAGES_DIR)
-        return torch.load(image_path)
+        return torch.load(image_path, map_location="cpu")
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 3a343986a9345..411584b1a6c3c 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
         if sr != feature_extractor.sampling_rate:
             try:
                 import librosa
-            except ImportError:
+            except ImportError as exc:
                 raise ImportError(
-                    "Please install vllm[audio] for audio support.") from None
+                    "Please install vllm[audio] for audio support.") from exc
             audio = librosa.resample(audio,
                                      orig_sr=sr,
                                      target_sr=feature_extractor.sampling_rate)
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 0c666b8cc2e69..bee3c25dbd8dd 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
     try:
         import librosa
         import soundfile
-    except ImportError:
+    except ImportError as exc:
         raise ImportError(
-            "Please install vllm[audio] for audio support.") from None
+            "Please install vllm[audio] for audio support.") from exc
     return librosa, soundfile
 
 
@@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
     try:
         import cv2
         import decord
-    except ImportError:
+    except ImportError as exc:
         raise ImportError(
-            "Please install vllm[video] for video support.") from None
+            "Please install vllm[video] for video support.") from exc
     return cv2, decord
 
 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 2914f520d823c..162e1e4be873b 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -151,7 +151,11 @@ def __init__(
             self.local_omp_cpuid = omp_cpuids.split("|")[rank]
 
         ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
-        if self.model_config.is_encoder_decoder:
+        if self.model_config.task == "embedding":
+            raise NotImplementedError(
+                "Embedding models are not supported for CPU backend")
+            # ModelRunnerClass = CPUEmbeddingModelRunner
+        elif self.model_config.is_encoder_decoder:
             ModelRunnerClass = CPUEncoderDecoderModelRunner
         self.model_runner: CPUModelRunner = ModelRunnerClass(
             vllm_config=vllm_config,