Merge branch 'main' into omer/run-loader

omer-dayan · Nov 14, 2024 · 0e5335d · 0e5335d
2 parents d3379a2 + 294bf46
commit 0e5335d
Show file tree

Hide file tree

Showing 239 changed files with 9,371 additions and 3,568 deletions.
diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
@@ -18,15 +18,16 @@ source /etc/environment
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
 
 function cpu_tests() {
+  set -e
+
   # Run basic model test
   docker exec cpu-test bash -c "
     set -e
     pip install pytest pytest-asyncio \
       decord einops librosa peft Pillow sentence-transformers soundfile \
       transformers_stream_generator matplotlib datamodel_code_generator
     pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-    # Embedding models are not supported for CPU yet
-    # pytest -v -s tests/models/embedding/language
+    pytest -v -s tests/models/embedding/language
     pytest -v -s tests/models/encoder_decoder/language
     pytest -v -s tests/models/decoder_only/language/test_models.py
     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
@@ -4,22 +4,28 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-48-95}
+NUMA_NODE=${NUMA_NODE:-1}
+
 # Try building the docker image
-numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C $CORE_RANGE -N $NUMA_NODE docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C $CORE_RANGE -N $NUMA_NODE docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
 remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
- --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
- --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE  \
+ --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \
+ --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
 
 function cpu_tests() {
+  set -e
+
   # offline inference
   docker exec cpu-test-avx2 bash -c "
     set -e
@@ -32,8 +38,7 @@ function cpu_tests() {
       decord einops librosa peft Pillow sentence-transformers soundfile \
       transformers_stream_generator matplotlib datamodel_code_generator
     pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-    # Embedding models are not supported for CPU yet
-    # pytest -v -s tests/models/embedding/language
+    pytest -v -s tests/models/embedding/language
     pytest -v -s tests/models/encoder_decoder/language
     pytest -v -s tests/models/decoder_only/language/test_models.py
     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
@@ -56,7 +61,7 @@ function cpu_tests() {
   docker exec cpu-test bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10 
-    export VLLM_CPU_OMP_THREADS_BIND=48-92 
+    export VLLM_CPU_OMP_THREADS_BIND=$CORE_RANGE
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
     python3 benchmarks/benchmark_serving.py \

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -165,6 +165,14 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
+- label: V1 Test
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - pytest -v -s v1
+
 - label: Examples Test # 15min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]

diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 
       - name: Set up Python
-        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
           python-version: '3.12'
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -37,7 +37,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
 
 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -187,7 +187,8 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
 set(VLLM_EXT_SRC
   "csrc/cache_kernels.cu"
-  "csrc/attention/attention_kernels.cu"
+  "csrc/attention/paged_attention_v1.cu"
+  "csrc/attention/paged_attention_v2.cu"
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,3 +1,3 @@
 # Contributing to vLLM
 
-You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview/).
+You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
@@ -31,7 +31,7 @@ RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
 RUN python3 -m pip install -U \
-        'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         -r requirements-neuron.txt
 
 ENV VLLM_TARGET_DEVICE neuron

diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
@@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 # These packages will be in rocketce eventually
 RUN --mount=type=cache,target=/root/.cache/pip  \
     pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
-        'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         torch==2.3.1 \
         -r requirements-cpu.txt \
         xformers uvloop==0.20.0