diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index 5add7ff0c15c9..cd2bfd8bb5bf4 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -17,30 +17,35 @@ source /etc/environment
 #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
 
-# Run basic model test
-docker exec cpu-test bash -c "
-  set -e
-  pip install pytest pytest-asyncio \
-    decord einops librosa peft Pillow sentence-transformers soundfile \
-    transformers_stream_generator matplotlib datamodel_code_generator
-  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-  # Embedding models are not supported for CPU yet
-  # pytest -v -s tests/models/embedding/language
-  pytest -v -s tests/models/encoder_decoder/language
-  pytest -v -s tests/models/decoder_only/language/test_models.py
-  # Chunked prefill not supported for CPU yet
-  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
-  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+function cpu_tests() {
+  # Run basic model test
+  docker exec cpu-test bash -c "
+    set -e
+    pip install pytest pytest-asyncio \
+      decord einops librosa peft Pillow sentence-transformers soundfile \
+      transformers_stream_generator matplotlib datamodel_code_generator
+    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+    # Embedding models are not supported for CPU yet
+    # pytest -v -s tests/models/embedding/language
+    pytest -v -s tests/models/encoder_decoder/language
+    pytest -v -s tests/models/decoder_only/language/test_models.py
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
-# online inference
-docker exec cpu-test bash -c "
-  set -e
-  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
-  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-  python3 benchmarks/benchmark_serving.py \
-    --backend vllm \
-    --dataset-name random \
-    --model facebook/opt-125m \
-    --num-prompts 20 \
-    --endpoint /v1/completions \
-    --tokenizer facebook/opt-125m"
+  # online inference
+  docker exec cpu-test bash -c "
+    set -e
+    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+    python3 benchmarks/benchmark_serving.py \
+      --backend vllm \
+      --dataset-name random \
+      --model facebook/opt-125m \
+      --num-prompts 20 \
+      --endpoint /v1/completions \
+      --tokenizer facebook/opt-125m"
+}
+
+# All of the CPU tests are expected to finish in under 25 minutes.
+export -f cpu_tests
+timeout 25m bash -c "cpu_tests"
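A note on the wrapper this diff introduces: `timeout` runs its command in a fresh child shell, and that child cannot see functions defined in the parent script unless they have been exported with `export -f`. The sketch below isolates just that pattern under a hypothetical function name; only `cpu_tests` and the 25-minute budget come from the diff itself.

  # export-then-timeout pattern (sketch; my_tests is a hypothetical stand-in)
  function my_tests() {
    echo "running the suite..."
  }
  export -f my_tests               # copy the function definition into the environment
  timeout 25m bash -c "my_tests"   # the child bash can now call it; the run is killed after 25 minutes
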
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 25a448e63be27..8d4f4d1a681f2 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -19,50 +19,55 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
   --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
 
-# offline inference
-docker exec cpu-test-avx2 bash -c "
-  set -e
-  python3 examples/offline_inference.py"
+function cpu_tests() {
+  # offline inference
+  docker exec cpu-test-avx2 bash -c "
+    set -e
+    python3 examples/offline_inference.py"
 
-# Run basic model test
-docker exec cpu-test bash -c "
-  set -e
-  pip install pytest pytest-asyncio \
-    decord einops librosa peft Pillow sentence-transformers soundfile \
-    transformers_stream_generator matplotlib datamodel_code_generator
-  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-  # Embedding models are not supported for CPU yet
-  # pytest -v -s tests/models/embedding/language
-  pytest -v -s tests/models/encoder_decoder/language
-  pytest -v -s tests/models/decoder_only/language/test_models.py
-  # Chunked prefill not supported for CPU yet
-  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
-  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+  # Run basic model test
+  docker exec cpu-test bash -c "
+    set -e
+    pip install pytest pytest-asyncio \
+      decord einops librosa peft Pillow sentence-transformers soundfile \
+      transformers_stream_generator matplotlib datamodel_code_generator
+    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+    # Embedding models are not supported for CPU yet
+    # pytest -v -s tests/models/embedding/language
+    pytest -v -s tests/models/encoder_decoder/language
+    pytest -v -s tests/models/decoder_only/language/test_models.py
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
-# Run compressed-tensor test
-docker exec cpu-test bash -c "
-  set -e
-  pytest -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+  # Run compressed-tensor test
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -s -v \
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
-# Run AWQ test
-docker exec cpu-test bash -c "
-  set -e
-  pytest -s -v \
-    tests/quantization/test_ipex_quant.py"
+  # Run AWQ test
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -s -v \
+      tests/quantization/test_ipex_quant.py"
 
-# online inference
-docker exec cpu-test bash -c "
-  set -e
-  export VLLM_CPU_KVCACHE_SPACE=10
-  export VLLM_CPU_OMP_THREADS_BIND=48-92
-  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
-  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-  python3 benchmarks/benchmark_serving.py \
-    --backend vllm \
-    --dataset-name random \
-    --model facebook/opt-125m \
-    --num-prompts 20 \
-    --endpoint /v1/completions \
-    --tokenizer facebook/opt-125m"
+  # online inference
+  docker exec cpu-test bash -c "
+    set -e
+    export VLLM_CPU_KVCACHE_SPACE=10
+    export VLLM_CPU_OMP_THREADS_BIND=48-92
+    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
+    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+    python3 benchmarks/benchmark_serving.py \
+      --backend vllm \
+      --dataset-name random \
+      --model facebook/opt-125m \
+      --num-prompts 20 \
+      --endpoint /v1/completions \
+      --tokenizer facebook/opt-125m"
+}
+
+# All of the CPU tests are expected to finish in under 25 minutes.
+export -f cpu_tests
+timeout 25m bash -c "cpu_tests"
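Both scripts also share the same server-readiness idiom for the online-inference step: start the API server in the background, poll an endpoint until it answers, and fail the step if the server never comes up within the 600-second deadline. A minimal sketch of that idiom follows; the `trap`-based cleanup is an addition here and is not part of the original scripts.

  # start the server in the background and remember its PID
  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
  server_pid=$!
  trap 'kill "$server_pid"' EXIT   # hypothetical cleanup, not in the originals
  # poll until the server responds, giving up after 600 seconds
  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
  # ...the benchmark then runs against the now-ready server on localhost:8000...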