diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 3b7fa0f2d94b3..f78e360b7afd3 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -6,28 +6,23 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - # rename the files to change linux -> manylinux1 - - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" - - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + - "bash .buildkite/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - block: "Build CUDA 11.8 wheel" - key: block-build-cu118-wheel - + # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. + # However, this block can be uncommented to save some compute hours. + # - block: "Build CUDA 11.8 wheel" + # key: block-build-cu118-wheel + - label: "Build wheel - CUDA 11.8" - depends_on: block-build-cu118-wheel + # depends_on: block-build-cu118-wheel agents: queue: cpu_queue commands: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." 
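      # The remaining steps mirror the CUDA 12.1 job above: copy the built wheel out of
      # the image into artifacts/dist, then hand it to upload-wheels.sh (added in this
      # change), which renames linux -> manylinux1 and uploads the wheel to the commit,
      # nightly, and version prefixes of the vllm-wheels S3 bucket.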
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - # rename the files to change linux -> manylinux1 - - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" - - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" - - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" + - "bash .buildkite/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 79526adef2a79..5d7a0bff90963 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -27,9 +27,9 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - pytest -v -s tests/models/embedding/language - pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language/test_models.py + pytest -v -s tests/models/decoder_only/language -m cpu_model + pytest -v -s tests/models/embedding/language -m cpu_model + pytest -v -s tests/models/encoder_decoder/language -m cpu_model pytest -v -s tests/models/decoder_only/audio_language -m cpu_model pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index b3771bb268e22..14756b5964aaf 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -9,8 +9,8 @@ CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} # Try building the docker image -numactl -C $CORE_RANGE -N $NUMA_NODE docker build -t cpu-test -f Dockerfile.cpu . -numactl -C $CORE_RANGE -N $NUMA_NODE docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } @@ -18,10 +18,10 @@ trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. 
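# Note: --cpuset-cpus and --cpuset-mems restrict each container to the core range and
# NUMA node chosen above; quoting "$CORE_RANGE" and "$NUMA_NODE" (as this change does)
# protects against word splitting should those variables ever contain spaces.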
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \ - --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \ - --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 function cpu_tests() { set -e @@ -38,9 +38,9 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - pytest -v -s tests/models/embedding/language - pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language/test_models.py + pytest -v -s tests/models/decoder_only/language -m cpu_model + pytest -v -s tests/models/embedding/language -m cpu_model + pytest -v -s tests/models/encoder_decoder/language -m cpu_model pytest -v -s tests/models/decoder_only/audio_language -m cpu_model pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" @@ -61,7 +61,7 @@ function cpu_tests() { docker exec cpu-test bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=$CORE_RANGE + export VLLM_CPU_OMP_THREADS_BIND=$1 python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ @@ -75,4 +75,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 25 mins. 
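# `export -f` makes the cpu_tests function available to the child shell spawned by
# `bash -c` below. Variables that are not exported (such as CORE_RANGE) are not
# visible in that child shell, which is why the core range is now passed explicitly
# as "$1" and read inside the function instead of being referenced as $CORE_RANGE.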
export -f cpu_tests -timeout 25m bash -c "cpu_tests" +timeout 25m bash -c "cpu_tests $CORE_RANGE" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fbaa427bb7270..24bf223fb12c0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -313,71 +313,70 @@ steps: ##### models test ##### -- label: Basic Models Test # 10min +- label: Basic Models Test # 30min source_file_dependencies: - vllm/ - tests/models commands: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s models/*.py --ignore=models/test_oot_registration.py + - pytest -v -s models/test_registry.py + - pytest -v -s models/test_initialization.py -- label: Decoder-only Language Models Test (Standard) # 18min +- label: Language Models Test (Standard) # 42min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language commands: - - pytest -v -s models/decoder_only/language -m core_model - - pytest -v -s models/decoder_only/language -m quant_model + - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' + - pytest -v -s models/embedding/language -m core_model + - pytest -v -s models/embedding/vision_language -m core_model -- label: Decoder-only Language Models Test (Extended) # 46min +- label: Language Models Test (Extended) # 50min nightly: true source_file_dependencies: - vllm/ - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language commands: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/language -m 'not core_model' + - pytest -v -s models/embedding/vision_language -m 'not core_model' -- label: Decoder-only Multi-Modal Models Test (Standard) # 22min +- label: Multi-Modal Models Test (Standard) # 26min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language commands: - - pytest -v -s models/decoder_only/audio_language -m core_model - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model - # No tests under this group for now - # - pytest -v -s models/decoder_only/audio_language -m quant_model - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model + - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' + - pytest -v -s models/encoder_decoder/language -m core_model + - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Decoder-only Multi-Modal Models Test (Extended) # 1h10m +- label: Multi-Modal Models Test (Extended) # 1h15m nightly: true source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language commands: - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # 
https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - -- label: Other Models Test # 20min - #mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/models/embedding/language - - tests/models/embedding/vision_language - - tests/models/encoder_decoder/language - - tests/models/encoder_decoder/vision_language - commands: - - pytest -v -s models/embedding/language - - pytest -v -s models/embedding/vision_language - - pytest -v -s models/encoder_decoder/language - - pytest -v -s models/encoder_decoder/vision_language + - pytest -v -s models/encoder_decoder/language -m 'not core_model' + - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh new file mode 100644 index 0000000000000..541b395eddbe7 --- /dev/null +++ b/.buildkite/upload-wheels.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +# Assume wheels are in artifacts/dist/*.whl +wheel_files=(artifacts/dist/*.whl) + +# Check that exactly one wheel is found +if [[ ${#wheel_files[@]} -ne 1 ]]; then + echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" + exit 1 +fi + +# Get the single wheel file +wheel="${wheel_files[0]}" + +# Rename 'linux' to 'manylinux1' in the wheel filename +new_wheel="${wheel/linux/manylinux1}" +mv -- "$wheel" "$new_wheel" +wheel="$new_wheel" + +# Extract the version from the wheel +version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) +echo "Version: $version" + +# If the version contains "dev", rename it to v1.0.0.dev for consistency +if [[ $version == *dev* ]]; then + new_version="1.0.0.dev" + new_wheel="${wheel/$version/$new_version}" + mv -- "$wheel" "$new_wheel" + wheel="$new_wheel" + version="$new_version" +fi + +# Upload the wheel to S3 +aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" +aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index be0afc6305044..51a73c857ccb2 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,73 +2,4 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) -**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** - ---- - -
-<details>
-<summary><b> PR Checklist (Click to Expand) </b></summary>
-
-<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
-
-<h3>PR Title and Classification</h3>
-<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
-<ul>
-    <li><code>[Bugfix]</code> for bug fixes.</li>
-    <li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
-    <li><code>[Doc]</code> for documentation fixes and improvements.</li>
-    <li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
-    <li><code>[Frontend]</code> for changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.)</li>
-    <li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
-    <li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
-    <li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
-    <li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
-</ul>
-<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
-
-<h3>Code Quality</h3>
-
-<p>The PR needs to meet the following code quality standards:</p>
-
-<ul>
-    <li>We adhere to the Google Python style guide and Google C++ style guide.</li>
-    <li>Pass all linter checks. Please use <code>format.sh</code> to format your code.</li>
-    <li>The code needs to be well-documented to ensure future contributors can easily understand the code.</li>
-    <li>Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.</li>
-    <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
-</ul>
-
-<h3>Adding or changing kernels</h3>
-<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
-<ul>
-    <li>Make sure custom ops are registered following PyTorch guidelines: Custom C++ and CUDA Operators, and The Custom Operators Manual.</li>
-    <li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
-    <li>Use <code>torch.library.opcheck()</code> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
-    <li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
-    <li>If a new custom type is needed, see the Custom Class Support in PT2 document.</li>
-</ul>
-
-<h3>Notes for Large Changes</h3>
-<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
-
-<h3>What to Expect for the Reviews</h3>
-
-<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:</p>
-
-<ul>
-    <li>After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
-    <li>After the PR is assigned, the reviewer will provide status updates every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
-    <li>After the review, the reviewer will put an <code>action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
-    <li>Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.</li>
-</ul>
-
-<h3>Thank You</h3>
-
-<p>Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!</p>
-
-</details>
- - +**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html ** diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh index 3b2da7b9f8966..3246c6f9bc4b7 100755 --- a/.github/scripts/cleanup_pr_body.sh +++ b/.github/scripts/cleanup_pr_body.sh @@ -15,19 +15,36 @@ NEW=/tmp/new_pr_body.txt gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}" cp "${OLD}" "${NEW}" -# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" -sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}" - # Remove "FIX #xxxx (*link existing issues this PR will resolve*)" sed -i '/FIX #xxxx.*$/d' "${NEW}" # Remove "FILL IN THE PR DESCRIPTION HERE" sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}" +# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" +sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" + +# Remove HTML
<details> section that includes text of "PR Checklist (Click to Expand)"
+python3 - <<EOF
+import re
+
+with open("${NEW}", "r") as file:
+    content = file.read()
+
+pattern = re.compile(r'<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>
', re.DOTALL) +content = re.sub(pattern, '', content) + +with open("${NEW}", "w") as file: + file.write(content) +EOF + # Run this only if ${NEW} is different than ${OLD} if ! cmp -s "${OLD}" "${NEW}"; then - echo "Updating PR body" gh pr edit --body-file "${NEW}" "${PR_NUMBER}" + echo + echo "Updated PR body:" + echo + cat "${NEW}" else echo "No changes needed" fi diff --git a/README.md b/README.md index b75bfc5c699a7..0ef073210d070 100644 --- a/README.md +++ b/README.md @@ -15,16 +15,8 @@ Easy, fast, and cheap LLM serving for everyone --- -**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo** - -We are excited to announce the last in-person vLLM meetup of the year! -Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist! -Register [here](https://lu.ma/h0qvrajz) and be a part of the event! - ---- - - *Latest News* 🔥 +- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). @@ -108,6 +100,7 @@ vLLM is a community project. Our compute resources for development and testing a - Dropbox - Google Cloud - Lambda Lab +- Nebius - NVIDIA - Replicate - Roblox diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index bdb8ea8e2a5dc..e9fc037a46965 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -251,6 +251,19 @@ def sample_hf_requests( "url": f"data:image/jpeg;base64,{image_base64}" }, } + elif "image" in data and isinstance(data["image"], str): + if (data["image"].startswith("http://") or \ + data["image"].startswith("file://")): + image_url = data["image"] + else: + image_url = f"file://{data['image']}" + + mm_content = { + "type": "image_url", + "image_url": { + "url": image_url + }, + } else: mm_content = None diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 8b99f0843aaf6..741cd0c82dc89 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -98,6 +98,9 @@ void paged_attention_v1_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. 
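  // Each case below instantiates the paged attention kernel for one supported head
  // size; the new `case 32` added in this change extends support to models whose
  // attention head dimension is 32.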
+ case 32: + LAUNCH_PAGED_ATTENTION_V1(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V1(64); break; diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index 3a7a9dee916aa..6de8d0bdd5b8d 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -104,6 +104,9 @@ void paged_attention_v2_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V2(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V2(64); break; diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e3953c7c45719..e73eca1b345fd 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -385,6 +385,9 @@ void paged_attention_v1_impl_launcher( int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { + case 32: + LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; case 64: LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break; @@ -702,6 +705,9 @@ void paged_attention_v2_impl_launcher( int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { + case 32: + LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; case 64: LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break; diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst index a3962e96e7913..c87f01aa263b3 100644 --- a/docs/source/community/meetups.rst +++ b/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The seventh vLLM meetup `__, with Snowflake, November 14th 2024. `[Slides] `__ - `The sixth vLLM meetup `__, with NVIDIA, September 9th 2024. `[Slides] `__ - `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ - `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index 52fbf9a577c7e..c6f83b3a92ca0 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -15,6 +15,7 @@ vLLM is a community project. Our compute resources for development and testing a - Dropbox - Google Cloud - Lambda Lab +- Nebius - NVIDIA - Replicate - Roblox diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.rst index ac2d2b2fe4103..4cea0afdaea74 100644 --- a/docs/source/contributing/overview.rst +++ b/docs/source/contributing/overview.rst @@ -41,15 +41,6 @@ Testing Contribution Guidelines ======================= -DCO and Signed-off-by ----------------------- - -When contributing changes to this project, you must agree to the `DCO `_. -Commits must include a ``Signed-off-by:`` header which certifies agreement with -the terms of the `DCO `_. - -Using ``-s`` with ``git commit`` will automatically add this header. - Issues ------ @@ -61,7 +52,110 @@ If you encounter a bug or have a feature request, please `search existing issues Pull Requests & Code Reviews ---------------------------- -Please check the PR checklist in the `PR template `_ for a detailed guide for contribution. +Thank you for your contribution to vLLM! Before submitting the pull request, +please ensure the PR meets the following criteria. 
This helps vLLM maintain the +code quality and improve the efficiency of the review process. + +DCO and Signed-off-by +^^^^^^^^^^^^^^^^^^^^^ + +When contributing changes to this project, you must agree to the `DCO `_. +Commits must include a ``Signed-off-by:`` header which certifies agreement with +the terms of the `DCO `_. + +Using ``-s`` with ``git commit`` will automatically add this header. + +PR Title and Classification +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Only specific types of PRs will be reviewed. The PR title is prefixed +appropriately to indicate the type of change. Please use one of the following: + +- ``[Bugfix]`` for bug fixes. +- ``[CI/Build]`` for build or continuous integration improvements. +- ``[Doc]`` for documentation fixes and improvements. +- ``[Model]`` for adding a new model or improving an existing model. Model name + should appear in the title. +- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server, + ``LLM`` class, etc.) +- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels. +- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``, + ``AsyncLLMEngine``, ``Scheduler``, etc.) +- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should + appear in the prefix (e.g., ``[Hardware][AMD]``). +- ``[Misc]`` for PRs that do not fit the above categories. Please use this + sparingly. + +.. note:: + If the PR spans more than one category, please include all relevant prefixes. + +Code Quality +^^^^^^^^^^^^ + +The PR needs to meet the following code quality standards: + +- We adhere to `Google Python style guide + `_ and `Google C++ style guide + `_. +- Pass all linter checks. Please use `format.sh + `_ to format your + code. +- The code needs to be well-documented to ensure future contributors can easily + understand the code. +- Include sufficient tests to ensure the project stays correct and robust. This + includes both unit tests and integration tests. +- Please add documentation to ``docs/source/`` if the PR modifies the + user-facing behaviors of vLLM. It helps vLLM users understand and utilize the + new features or changes. + +Adding or Changing Kernels +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Each custom kernel needs a schema and one or more implementations to be registered with PyTorch. + +- Make sure custom ops are registered following PyTorch guidelines: + `Custom C++ and CUDA Operators `_ + and `The Custom Operators Manual `_. +- Custom operations that return ``Tensors`` require meta-functions. + Meta-functions should be implemented and registered in Python so that dynamic + dims can be handled automatically. See above documents for a description of + meta-functions. +- Use `torch.library.opcheck() `_ + to test the function registration and meta-function for any registered ops. + See ``tests/kernels`` for examples. +- When changing the C++ signature of an existing op, the schema must be updated + to reflect the changes. +- If a new custom type is needed, see the following document: + `Custom Class Support in PT2 `_. + +Notes for Large Changes +^^^^^^^^^^^^^^^^^^^^^^^ + +Please keep the changes as concise as possible. For major architectural changes +(>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue +(RFC) discussing the technical design and justification. Otherwise, we will tag +it with ``rfc-required`` and might not go through the PR. + +What to Expect for the Reviews +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The goal of the vLLM team is to be a *transparent reviewing machine*. 
We would +like to make the review process transparent and efficient and make sure no +contributor feels confused or frustrated. However, the vLLM team is small, so we +need to prioritize some PRs over others. Here is what you can expect from the +review process: + +- After the PR is submitted, the PR will be assigned to a reviewer. Every + reviewer will pick up the PRs based on their expertise and availability. +- After the PR is assigned, the reviewer will provide status updates every 2-3 + days. If the PR is not reviewed within 7 days, please feel free to ping the + reviewer or the vLLM team. +- After the review, the reviewer will put an ``action-required`` label on the PR + if there are changes required. The contributor should address the comments and + ping the reviewer to re-review the PR. +- Please respond to all comments within a reasonable time frame. If a comment + isn't clear or you disagree with a suggestion, feel free to ask for + clarification or discuss the suggestion. Thank You --------- diff --git a/docs/source/design/class_hierarchy.rst b/docs/source/design/class_hierarchy.rst index 15f0c8ccf77ee..58a888b17ba53 100644 --- a/docs/source/design/class_hierarchy.rst +++ b/docs/source/design/class_hierarchy.rst @@ -1,3 +1,5 @@ +.. _class_hierarchy: + vLLM's Class Hierarchy ======================= diff --git a/docs/source/design/plugin_system.rst b/docs/source/design/plugin_system.rst new file mode 100644 index 0000000000000..bfca702b9267a --- /dev/null +++ b/docs/source/design/plugin_system.rst @@ -0,0 +1,62 @@ +.. _plugin_system: + +vLLM's Plugin System +==================== + +The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. + +How Plugins Work in vLLM +------------------------ + +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`class_hierarchy`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work. + +How vLLM Discovers Plugins +-------------------------- + +vLLM's plugin system uses the standard Python ``entry_points`` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: + +.. code-block:: python + + # inside `setup.py` file + from setuptools import setup + + setup(name='vllm_add_dummy_model', + version='0.1', + packages=['vllm_add_dummy_model'], + entry_points={ + 'vllm.general_plugins': + ["register_dummy_model = vllm_add_dummy_model:register"] + }) + + # inside `vllm_add_dummy_model.py` file + def register(): + from vllm import ModelRegistry + + if "MyLlava" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model("MyLlava", + "vllm_add_dummy_model.my_llava:MyLlava") + +For more information on adding entry points to your package, please check the `official documentation `__. + +Every plugin has three parts: + +1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group ``vllm.general_plugins`` to register general plugins. 
This is the key of ``entry_points`` in the ``setup.py`` file. Always use ``vllm.general_plugins`` for vLLM's general plugins. + +2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the ``entry_points`` dictionary. In the example above, the plugin name is ``register_dummy_model``. Plugins can be filtered by their names using the ``VLLM_PLUGINS`` environment variable. To load only a specific plugin, set ``VLLM_PLUGINS`` to the plugin name. + +3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is ``vllm_add_dummy_model:register``, which refers to a function named ``register`` in the ``vllm_add_dummy_model`` module. + +What Can Plugins Do? +-------------------- + +Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling ``ModelRegistry.register_model`` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. + +Guidelines for Writing Plugins +------------------------------ + +- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. + +Compatibility Guarantee +----------------------- + +vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. \ No newline at end of file diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 75ab2b6ba02dc..22cc684a1c778 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -44,15 +44,18 @@ Requirements Provision Cloud TPUs ==================== -You can provision Cloud TPUs using the `Cloud TPU API `_` -or the `queued resources `_` -API. This section shows how to create TPUs using the queued resource API. -For more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. -`Queued resources `_ -enable you to request Cloud TPU resources in a queued manner. When you request -queued resources, the request is added to a queue maintained by the Cloud TPU -service. When the requested resource becomes available, it's assigned to your -Google Cloud project for your immediate exclusive use. +You can provision Cloud TPUs using the `Cloud TPU API `_ +or the `queued resources `_ +API. This section shows how to create TPUs using the queued resource API. For +more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. +Queued resources enable you to request Cloud TPU resources in a queued manner. +When you request queued resources, the request is added to a queue maintained by +the Cloud TPU service. When the requested resource becomes available, it's +assigned to your Google Cloud project for your immediate exclusive use. + +.. 
note:: + In all of the following commands, replace the ALL CAPS parameter names with + appropriate values. See the parameter descriptions table for more information. Provision a Cloud TPU with the queued resource API -------------------------------------------------- @@ -68,6 +71,7 @@ Create a TPU v5e with 4 TPU chips: --runtime-version RUNTIME_VERSION \ --service-account SERVICE_ACCOUNT + .. list-table:: Parameter descriptions :header-rows: 1 @@ -81,12 +85,13 @@ Create a TPU v5e with 4 TPU chips: * - PROJECT_ID - Your Google Cloud project * - ZONE - - The `zone `_ where you - want to create your Cloud TPU. + - The GCP zone where you want to create your Cloud TPU. The value you use + depends on the version of TPUs you are using. For more information, see + `TPU regions and zones `_ * - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, followed by a - '-' and the number of TPU cores. For example `v5e-4` specifies a v5e TPU - with 4 cores. For more information, see `TPU versions `_. + - The TPU version you want to use. Specify the TPU version, for example + `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, + see `TPU versions `_. * - RUNTIME_VERSION - The TPU VM runtime version to use. For more information see `TPU VM images `_. * - SERVICE_ACCOUNT @@ -98,7 +103,15 @@ Connect to your TPU using SSH: .. code-block:: bash - gcloud compute tpus tpu-vm ssh TPU_NAME + gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE + +Install Miniconda + +.. code-block:: bash + + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + bash Miniconda3-latest-Linux-x86_64.sh + source ~/.bashrc Create and activate a Conda environment for vLLM: @@ -162,9 +175,11 @@ Run the Docker image with the following command: .. note:: - Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. - The compilation time may take 20~30 minutes in the first run. - However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the + possible input shapes and compiles an XLA graph for each shape. The + compilation time may take 20~30 minutes in the first run. However, the + compilation time reduces to ~5 minutes afterwards because the XLA graphs are + cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). .. tip:: @@ -173,7 +188,8 @@ Run the Docker image with the following command: .. 
code-block:: console from torch._C import * # noqa: F403 - ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory + ImportError: libopenblas.so.0: cannot open shared object file: No such + file or directory Install OpenBLAS with the following command: diff --git a/docs/source/index.rst b/docs/source/index.rst index 77c71afb13ac5..6fd1007a47248 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -159,6 +159,7 @@ Documentation design/class_hierarchy design/huggingface_integration + design/plugin_system design/input_processing/model_inputs_index design/kernel/paged_attention design/multimodal/multimodal_index diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index c6d88cc38e99b..a70ebf99c746f 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -102,11 +102,11 @@ This method should load the weights from the HuggingFace's checkpoint file and a Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. 6. Out-of-Tree Model Integration --------------------------------------------- +-------------------------------- -We also provide a way to integrate a model without modifying the vLLM codebase. Step 2, 3, 4 are still required, but you can skip step 1 and 5. +You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see :ref:`plugin_system`. -Just add the following lines in your code: +To register the model, use the following code: .. code-block:: python @@ -114,7 +114,7 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -If your model imports modules that initialize CUDA, consider instead lazy-importing it to avoid an error like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: .. code-block:: python @@ -123,19 +123,8 @@ If your model imports modules that initialize CUDA, consider instead lazy-import ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") .. important:: - If your model is a multimodal model, make sure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + If your model is a multimodal model, ensure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. Read more about that :ref:`here `. -If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: - -.. code-block:: python - - from vllm import ModelRegistry - from your_code import YourModelForCausalLM - ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - - if __name__ == '__main__': - import runpy - runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') - -Save the above code in a file and run it with :code:`python your_file.py `. +.. note:: + Although you can directly put these code snippets in your script using ``vllm.LLM``, the recommended way is to place these snippets in a vLLM plugin. 
This ensures compatibility with various vLLM features like distributed inference and the API server. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 161733c049bbe..96a513d42753b 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -330,16 +330,24 @@ Text Embedding - :code:`BAAI/bge-multilingual-gemma2`, etc. - - ✅︎ - * - :code:`MistralModel` - - Mistral-based + * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. + - Llama-based - :code:`intfloat/e5-mistral-7b-instruct`, etc. - ✅︎ - ✅︎ + * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` + - Qwen2-based + - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`, etc. + - ✅︎ + - ✅︎ .. important:: Some model architectures support both generation and embedding tasks. In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. +.. tip:: + You can override the model's pooling method by passing :code:`--override-pooler-config`. + Reward Modeling --------------- @@ -355,11 +363,11 @@ Reward Modeling * - :code:`Qwen2ForRewardModel` - Qwen2-based - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - - + - ✅︎ - ✅︎ .. note:: - As an interim measure, these models are supported via Embeddings API. See `this RFC `_ for upcoming changes. + As an interim measure, these models are supported in both offline and online inference via Embeddings API. Classification --------------- @@ -376,11 +384,11 @@ Classification * - :code:`Qwen2ForSequenceClassification` - Qwen2-based - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. - - + - ✅︎ - ✅︎ .. note:: - As an interim measure, these models are supported via Embeddings API. It will be supported via Classification API in the future (no reference APIs exist now). + As an interim measure, these models are supported in both offline and online inference via Embeddings API. Multimodal Language Models @@ -595,6 +603,9 @@ Multimodal Embedding Some model architectures support both generation and embedding tasks. In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. +.. tip:: + You can override the model's pooling method by passing :code:`--override-pooler-config`. + Model Support Policy ===================== diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 78965813b1213..79d032bf8b211 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -172,12 +172,20 @@ completion = client.chat.completions.create( ] ) ``` -Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like -`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which -format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify -between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match -this, unless explicitly specified. +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +`meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the +request. 
vLLM provides best-effort support to detect this automatically, which is logged as a string like +*"Detected the chat template content format to be..."*, and internally converts incoming requests to match +the detected format, which can be one of: + +- `"string"`: A string. + - Example: `"Hello world"` +- `"openai"`: A list of dictionaries, similar to OpenAI schema. + - Example: `[{"type": "text", "text": "Hello world!"}]` + +If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument +to override which format to use. ## Command line arguments for the server diff --git a/requirements-common.txt b/requirements-common.txt index 8bfa113e74fd6..5c1251a753eef 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -17,7 +17,7 @@ pillow # Required for image processing prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer -lm-format-enforcer == 0.10.6 +lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 typing_extensions >= 4.10 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 @@ -26,7 +26,7 @@ pyzmq msgspec gguf == 0.10.0 importlib_metadata -mistral_common[opencv] >= 1.4.4 +mistral_common[opencv] >= 1.5.0 pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 5d566f8308b70..c49ed9802cde8 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -166,14 +166,14 @@ def iter_params(self, model_name: str): "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4), "mosaicml/mpt-7b": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(), - "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "allenai/OLMo-1B-hf": PPTestSettings.fast(), + "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), + "adept/persimmon-8b-chat": PPTestSettings.fast(), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "adept/persimmon-8b-chat": PPTestSettings.fast(), + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 50444d3abfaf2..686b697c98e03 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -117,6 
+117,7 @@ def allgather_worker(rank, WORLD_SIZE, port1, port2): pg1.barrier() +@pytest.mark.skip(reason="This test is flaky and prone to hang.") @multi_gpu_test(num_gpus=4) @pytest.mark.parametrize( "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index e92e2588d01cb..7b1be5a9802fd 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -2,6 +2,7 @@ import pytest +from vllm.config import PoolerConfig from vllm.engine.arg_utils import EngineArgs, nullable_kvs from vllm.utils import FlexibleArgumentParser @@ -32,9 +33,13 @@ def test_limit_mm_per_prompt_parser(arg, expected): def test_valid_pooling_config(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) - args = parser.parse_args(["--pooling-type=MEAN"]) + args = parser.parse_args([ + '--override-pooler-config', + '{"pooling_type": "MEAN"}', + ]) engine_args = EngineArgs.from_cli_args(args=args) - assert engine_args.pooling_type == 'MEAN' + assert engine_args.override_pooler_config == PoolerConfig( + pooling_type="MEAN", ) @pytest.mark.parametrize( diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index e969d33775d86..93660e6118ca8 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -26,7 +26,6 @@ class MockModelConfig: tokenizer = MODEL_NAME trust_remote_code = False tokenizer_mode = "auto" - chat_template_text_format = "string" max_model_len = 100 tokenizer_revision = None multimodal_config = MultiModalConfig() @@ -49,6 +48,7 @@ async def _async_serving_chat_init(): BASE_MODEL_PATHS, response_role="assistant", chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", lora_modules=None, prompt_adapters=None, request_logger=None) @@ -70,6 +70,7 @@ def test_serving_chat_should_set_correct_max_tokens(): BASE_MODEL_PATHS, response_role="assistant", chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", lora_modules=None, prompt_adapters=None, request_logger=None) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 5fa466f8f041f..72477e048eafa 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -6,15 +6,24 @@ from vllm.assets.image import ImageAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import (parse_chat_messages, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, + parse_chat_messages, + parse_chat_messages_futures, + resolve_chat_template_content_format) from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from ..utils import VLLM_PATH + +EXAMPLES_DIR = VLLM_PATH / "examples" + PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" +ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_3" +QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" +LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B" @pytest.fixture(scope="function") @@ -26,7 +35,6 @@ def phi3v_model_config(): trust_remote_code=True, dtype="bfloat16", seed=0, - chat_template_text_format="string", limit_mm_per_prompt={ "image": 2, }) @@ -94,19 +102,24 @@ def test_parse_chat_messages_single_image( phi3v_tokenizer, 
image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -121,19 +134,24 @@ async def test_parse_chat_messages_single_image_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_future = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -147,24 +165,29 @@ def test_parse_chat_messages_multiple_images( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -181,24 +204,29 @@ async def test_parse_chat_messages_multiple_images_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_future = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -214,27 +242,31 @@ def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to <|image_2|>?" 
- }] - }], phi3v_model_config, phi3v_tokenizer) - + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to <|image_2|>?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -249,26 +281,35 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to the other one?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to the other one?" # noqa: E501 + } + ] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -285,34 +326,39 @@ def test_parse_chat_messages_multiple_images_across_messages( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + "role": "assistant", + "content": "Some stuff." }, { - "type": "text", - "text": "What about this one?" - }] - }], phi3v_model_config, phi3v_tokenizer) + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about this one?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [ { @@ -335,7 +381,6 @@ def test_parse_chat_messages_context_text_format( phi3v_model_config, phi3v_tokenizer, ): - phi3v_model_config.chat_template_text_format = "openai" conversation, mm_data = parse_chat_messages( [{ "role": "user", @@ -349,7 +394,11 @@ def test_parse_chat_messages_context_text_format( }, { "role": "user", "content": "What about this one?" - }], phi3v_model_config, phi3v_tokenizer) + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="openai", + ) assert conversation == [ { @@ -389,29 +438,34 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." 
): - parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) def test_parse_chat_messages_rejects_too_many_images_across_messages( @@ -427,39 +481,44 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): - parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + "role": "assistant", + "content": "Some stuff." }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What about these two?" - }] - }], phi3v_model_config, phi3v_tokenizer) + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about these two?" 
+ }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) def test_parse_chat_messages_multiple_images_uncommon_input( @@ -467,17 +526,22 @@ def test_parse_chat_messages_multiple_images_uncommon_input( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [ - "What's in these images?", { - "image_url": image_url - }, { - "image_url": image_url - } - ] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + "What's in these images?", { + "image_url": image_url + }, { + "image_url": image_url + } + ] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -495,16 +559,21 @@ def test_mllama_single_image( image_url, ): """Ensures that a single image is parsed correctly mllama.""" - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - 'type': 'text', - 'text': 'The content of this image is:' - }, { - "image_url": image_url - }] - }], mllama_model_config, mllama_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + 'type': 'text', + 'text': 'The content of this image is:' + }, { + "image_url": image_url + }] + }], + mllama_model_config, + mllama_tokenizer, + content_format="openai", + ) _assert_mm_data_is_image_input(mm_data, 1) assert conversation == [{ 'role': @@ -524,26 +593,31 @@ def test_mllama_interleaved_images( image_url, ): """Ensures that multiple image are parsed as interleaved dicts.""" - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [ - { - 'type': 'text', - 'text': 'The content of the first image is:' - }, - { - "image_url": image_url - }, - { - 'type': 'text', - 'text': 'The content of the second image is:' - }, - { - "image_url": image_url - }, - ] - }], mllama_model_config, mllama_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + 'type': 'text', + 'text': 'The content of the first image is:' + }, + { + "image_url": image_url + }, + { + 'type': 'text', + 'text': 'The content of the second image is:' + }, + { + "image_url": image_url + }, + ] + }], + mllama_model_config, + mllama_tokenizer, + content_format="openai", + ) _assert_mm_data_is_image_input(mm_data, 2) assert conversation == [{ 'role': @@ -626,6 +700,7 @@ def get_conversation(is_hf: bool): vllm_conversation, model_config, tokenizer_group, + content_format="openai", ) vllm_result = apply_hf_chat_template( @@ -636,3 +711,89 @@ def get_conversation(is_hf: bool): ) assert hf_result == vllm_result + + +# yapf: disable +@pytest.mark.parametrize( + ("model", "expected_format"), + [(PHI3V_MODEL_ID, "string"), + (QWEN2VL_MODEL_ID, "openai"), + (ULTRAVOX_MODEL_ID, "string"), + (MLLAMA_MODEL_ID, "openai"), + (LLAMA_GUARD_MODEL_ID, "openai")], +) +# yapf: enable +def test_resolve_content_format_hf_defined(model, expected_format): + tokenizer_group = TokenizerGroup( + model, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + tokenizer = tokenizer_group.tokenizer + + chat_template = tokenizer.chat_template + assert isinstance(chat_template, str) + + print("[TEXT]") + print(chat_template) + print("[AST]") + print(_try_extract_ast(chat_template)) + + resolved_format = resolve_chat_template_content_format( + None, # Test detecting the tokenizer's chat_template + "auto", + tokenizer, + ) + + assert resolved_format == expected_format + + +# 
yapf: disable +@pytest.mark.parametrize( + ("template_path", "expected_format"), + [("template_alpaca.jinja", "string"), + ("template_baichuan.jinja", "string"), + ("template_blip2.jinja", "string"), + ("template_chatglm.jinja", "string"), + ("template_chatglm2.jinja", "string"), + ("template_chatml.jinja", "string"), + ("template_falcon_180b.jinja", "string"), + ("template_falcon.jinja", "string"), + ("template_inkbot.jinja", "string"), + ("template_llava.jinja", "string"), + ("template_vlm2vec.jinja", "openai"), + ("tool_chat_template_granite_20b_fc.jinja", "string"), + ("tool_chat_template_hermes.jinja", "string"), + ("tool_chat_template_internlm2_tool.jinja", "string"), + ("tool_chat_template_llama3.1_json.jinja", "string"), + ("tool_chat_template_llama3.2_json.jinja", "string"), + ("tool_chat_template_mistral_parallel.jinja", "string"), + ("tool_chat_template_mistral.jinja", "string")], +) +# yapf: enable +def test_resolve_content_format_examples(template_path, expected_format): + tokenizer_group = TokenizerGroup( + PHI3V_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + dummy_tokenizer = tokenizer_group.tokenizer + dummy_tokenizer.chat_template = None + + chat_template = load_chat_template(EXAMPLES_DIR / template_path) + assert isinstance(chat_template, str) + + print("[TEXT]") + print(chat_template) + print("[AST]") + print(_try_extract_ast(chat_template)) + + resolved_format = resolve_chat_template_content_format( + chat_template, + "auto", + dummy_tokenizer, + ) + + assert resolved_format == expected_format diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 12c578db0893c..761eb95c423fc 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -86,10 +86,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, assert torch_out.min() >= int8_traits.min and torch_out.max( ) <= int8_traits.max - ops_out = torch.empty_like(x, dtype=torch.int8) - scales_out = torch.empty_like(scales, dtype=torch.float32) - azp_out = torch.empty_like(azps, dtype=torch.int32) - torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out, azp_out) + ops_out, scales_out, azp_out = scaled_int8_quant(x, symmetric=False) if (not torch.allclose(scales_out, scales)): print(torch.argmax(torch.abs(scales_out - scales))) @@ -119,7 +116,8 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, out1 = (x / scale_arg).round().clamp(int8_traits.min, int8_traits.max).to(torch.int8) - out2, _, _ = scaled_int8_quant(x, scale_arg) + out2, scale2, _ = scaled_int8_quant(x, scale_arg) + assert scale2 is scale_arg # big atol to account for rounding errors torch.testing.assert_close(out1, out2, atol=1, rtol=0.0) @@ -145,11 +143,15 @@ def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, out1 = ((x / scale).round() + azp).clamp(int8_traits.min, int8_traits.max).to(torch.int8) - out2 = torch.empty_like(x, dtype=torch.int8) scale_arg = torch.tensor([scale], dtype=torch.float32, device="cuda") azp_arg = torch.tensor([azp], dtype=torch.int32, device="cuda") - torch.ops._C.static_scaled_int8_quant(out2, x, scale_arg, azp_arg) + out2, scale2, azp2 = scaled_int8_quant(x, + scale_arg, + azp_arg, + symmetric=False) + assert scale2 is scale_arg + assert azp2 is azp_arg # big atol to account for rounding errors torch.testing.assert_close(out1, out2, atol=1, rtol=0.0) @@ -184,6 +186,5 @@ def test_static_scaled_int8_azp_quant_saturating_cast(is_max: bool) -> None: val_i8 = int8_traits.max 
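The content-format tests above drive the new `resolve_chat_template_content_format` helper with `"auto"` detection. For illustration, a minimal sketch of the same call outside of pytest, assuming the import paths used by the test module (`vllm.entrypoints.chat_utils` and `vllm.transformers_utils.tokenizer_group`):

```python
# Sketch only: mirrors test_resolve_content_format_hf_defined. "auto" asks vLLM
# to inspect the chat template's Jinja AST and pick "string" or "openai"
# handling of message content.
from vllm.entrypoints.chat_utils import resolve_chat_template_content_format
from vllm.transformers_utils.tokenizer_group import TokenizerGroup

tokenizer = TokenizerGroup(
    "microsoft/Phi-3-vision-128k-instruct",  # PHI3V_MODEL_ID in these tests
    enable_lora=False,
    max_num_seqs=5,
    max_input_length=None,
).tokenizer

resolved = resolve_chat_template_content_format(
    None,    # no explicit template: fall back to the tokenizer's chat_template
    "auto",  # detect the format instead of forcing one
    tokenizer,
)
assert resolved == "string"  # the expected_format listed for Phi-3-vision above
```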
if is_max else int8_traits.min expected = torch.full((1, 5), val_i8, dtype=torch.int8, device="cuda") - out = torch.empty_like(expected) - torch.ops._C.static_scaled_int8_quant(out, x, scale, azp) + out, _, _ = scaled_int8_quant(x, scale, azp, symmetric=False) torch.testing.assert_close(expected, out, atol=0, rtol=0) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 17428ebfc2e28..8b23b62826053 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -45,7 +45,7 @@ def test_fused_moe( score = torch.randn((m, e), device="cuda", dtype=dtype) triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) torch_output = torch_moe(a, w1, w2, score, topk) - torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) @pytest.mark.parametrize("dtype", diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 7e5e2780d3916..ed321ba9f00c1 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -4,12 +4,17 @@ from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.models.bert import BertEmbeddingModel +from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.platforms import current_platform MAX_MODEL_LEN = 128 MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") +MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", + "intfloat/multilingual-e5-large") +REVISION_ROBERTA = os.environ.get("REVISION", "main") + @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") @@ -48,3 +53,42 @@ def test_model_loading_with_params(vllm_runner): assert model._pooler.normalize # assert output assert output + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_roberta_model_loading_with_params(vllm_runner): + """ + Test parameter weight loading with tp>1. 
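The int8 kernel tests above now exercise the `scaled_int8_quant` Python wrapper rather than `torch.ops._C` directly, and (per the `vllm/_custom_ops.py` hunk later in this patch) the static asymmetric path now returns the caller's `azp` instead of `None`. A hedged sketch of the two call patterns, assuming the wrapper is imported from `vllm._custom_ops` as in the kernel tests and a CUDA build is available:

```python
import torch

from vllm._custom_ops import scaled_int8_quant

x = torch.randn(16, 1024, dtype=torch.float16, device="cuda")

# Dynamic asymmetric quantization: per-token scales and zero points are computed.
out, scales, azp = scaled_int8_quant(x, symmetric=False)

# Static asymmetric quantization: the provided scale/azp tensors are returned
# unchanged, which is what the new `scale2 is scale_arg` / `azp2 is azp_arg`
# assertions above pin down.
scale = torch.tensor([0.02], dtype=torch.float32, device="cuda")
zero_point = torch.tensor([3], dtype=torch.int32, device="cuda")
out2, scale2, azp2 = scaled_int8_quant(x, scale, zero_point, symmetric=False)
assert scale2 is scale and azp2 is zero_point
```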
+ """ + with vllm_runner(model_name=MODEL_NAME_ROBERTA, + revision=REVISION_ROBERTA, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as model: + output = model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_config = model.model.llm_engine.model_config + + model_tokenizer = model.model.llm_engine.tokenizer + + # asserts on the bert model config file + assert model_config.encoder_config["max_seq_length"] == 512 + assert not model_config.encoder_config["do_lower_case"] + + # asserts on the pooling config files + assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name + assert model_config.pooler_config.pooling_norm + + # asserts on the tokenizer loaded + assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" + assert not model_tokenizer.tokenizer_config["do_lower_case"] + + model = model.model.llm_engine.model_executor\ + .driver_worker.model_runner.model + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + # assert output + assert output diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 384ec77e5455a..6542689c3f277 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,6 +33,10 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -293,17 +297,3 @@ def test_jamba_distributed_produces_identical_generation( name_0="vllm_tp_1", name_1="vllm_tp_2", ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 2dc231c595ffa..78eab8d5354fd 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,6 +51,10 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -279,17 +283,3 @@ def test_state_cleanup( except ValueError: pytest.fail("Mamba inner state wasn't cleaned up between states, " "could be related to finished_requests_ids") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. 
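The Jamba and Mamba tests above fold the separate `test_model_print` cases into `test_models` by printing the model once after generation; the point of that print is to exercise every module's `extra_repr`. A standalone sketch of the PyTorch mechanism being checked (nothing vLLM-specific):

```python
import torch.nn as nn


class TinyBlock(nn.Module):
    """Toy module showing the extra_repr hook that print(model) exercises."""

    def __init__(self, hidden_size: int = 16) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.linear = nn.Linear(hidden_size, hidden_size)

    def extra_repr(self) -> str:
        # Whatever is returned here is embedded in the module's printed repr.
        # A bug here (e.g. referencing a missing attribute) only surfaces when
        # someone actually prints the model, hence the explicit print in tests.
        return f"hidden_size={self.hidden_size}"


print(TinyBlock())  # the output includes "hidden_size=16" next to the Linear child
```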
- print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 6ec4b7e7e3f71..99b5d5694f9f7 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -2,9 +2,13 @@ Run `pytest tests/models/test_mistral.py`. """ +import copy + import pytest from vllm import SamplingParams +from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa + MistralToolParser) from ...utils import check_logprobs_close @@ -58,17 +62,69 @@ }, "required": ["city", "state", "unit"] } + }, +}, { + "type": "function", + "function": { + "name": "rewrite", + "description": "Rewrites text", + "parameters": { + "type": "object", + "required": [], + "properties": { + "text": { + "type": "string", + "description": "The input text to rewrite." + } + } + } } }] -MSGS = [{ - "role": - "user", - "content": ("Can you tell me what the temperate" - " will be in Dallas, in fahrenheit?") -}] -EXPECTED_FUNC_CALL = ( - '[{"name": "get_current_weather", "arguments": ' - '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]') +MSGS = [ + { + "role": "system", + "content": "You are an assistant." + }, + { + "role": + "user", + "content": + "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa + }, + { + "role": + "assistant", + "content": + "", + "tool_calls": [{ + "id": "bbc5b7ede", + "type": "function", + "function": { + "name": + "rewrite", + "arguments": + '{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa + } + }] + }, + { + "role": "tool", + "content": + "{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa + "tool_call_id": "bbc5b7ede", + "name": "rewrite" + }, + { + "role": "assistant", + "content": "---\n\nMy English needs improving, maybe I make errors" + }, + { + "role": + "user", + "content": ("Can you tell me what the temperate" + " will be in Dallas, in fahrenheit?") + } +] @pytest.mark.parametrize("model", MODELS) @@ -175,8 +231,23 @@ def test_mistral_function_calling( tokenizer_mode="mistral", config_format="mistral", load_format="mistral") as vllm_model: - outputs = vllm_model.model.chat(MSGS, + + msgs = copy.deepcopy(MSGS) + outputs = vllm_model.model.chat(msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS) - assert outputs[0].outputs[0].text.strip() == EXPECTED_FUNC_CALL + tokenizer = vllm_model.model.get_tokenizer() + tool_parser = MistralToolParser(tokenizer) + + model_output = outputs[0].outputs[0].text.strip() + assert model_output.startswith(tool_parser.bot_token), model_output + parsed_message = tool_parser.extract_tool_calls(model_output, None) + + assert parsed_message.tools_called + assert parsed_message.tool_calls[0].id == "0UAqFzWsD" + assert parsed_message.tool_calls[ + 0].function.name == "get_current_weather" + assert parsed_message.tool_calls[ + 0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa + assert parsed_message.content is None diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index beb1ffb18436e..2a7ed8826d2f3 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -4,37 +4,52 @@ """ import pytest -from vllm.platforms import current_platform - from ...utils import 
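The Mistral function-calling test above stops comparing against a hard-coded call string and instead decodes the output with `MistralToolParser`. A sketch of that flow, where `llm` is assumed to be a vLLM `LLM` configured with the Mistral tokenizer/format as in the test and `outputs` is the return value of `llm.chat(...)`:

```python
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
    MistralToolParser)

tool_parser = MistralToolParser(llm.get_tokenizer())

model_output = outputs[0].outputs[0].text.strip()
# Tool calls are prefixed with Mistral's tool-call marker token.
assert model_output.startswith(tool_parser.bot_token), model_output

parsed = tool_parser.extract_tool_calls(model_output, None)
if parsed.tools_called:
    for call in parsed.tool_calls:
        print(call.function.name, call.function.arguments)
else:
    print(parsed.content)
```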
check_logprobs_close -MODELS = [ - "facebook/opt-125m", # opt - "openai-community/gpt2", # gpt2 - # "Milos/slovak-gpt-j-405M", # gptj - # "bigcode/tiny_starcoder_py", # gpt_bigcode - # "EleutherAI/pythia-70m", # gpt_neox - "bigscience/bloom-560m", # bloom - testing alibi slopes - "microsoft/phi-2", # phi - # "stabilityai/stablelm-3b-4e1t", # stablelm - # "bigcode/starcoder2-3b", # starcoder2 - "google/gemma-1.1-2b-it", # gemma - "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 - "meta-llama/Llama-3.2-1B-Instruct", # llama -] - -if not current_platform.is_cpu(): - MODELS += [ - # fused_moe which not supported on CPU - "openbmb/MiniCPM3-4B", - ] - -target_dtype = "half" - -@pytest.mark.core_model -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize( + "model", + [ + pytest.param( + "bigscience/bloom-560m", # bloom - testing alibi slopes + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "openai-community/gpt2", # gpt2 + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param("Milos/slovak-gpt-j-405M"), # gptj + pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode + pytest.param("EleutherAI/pythia-70m"), # gpt_neox + pytest.param( + "google/gemma-1.1-2b-it", # gemma + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "meta-llama/Llama-3.2-1B-Instruct", # llama + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "openbmb/MiniCPM3-4B", + # fused_moe not supported on CPU + marks=[pytest.mark.core_model], + ), + pytest.param( + "facebook/opt-125m", # opt + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "microsoft/phi-2", # phi + marks=[pytest.mark.core_model], + ), + pytest.param( + "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 + marks=[pytest.mark.core_model], + ), + pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm + pytest.param("bigcode/starcoder2-3b"), # starcoder2 + ]) +@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) def test_models( diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 40ee49cf60742..6321503e7b248 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -9,10 +9,14 @@ import torch from transformers import AutoModelForSequenceClassification -CLASSIFICATION_MODELS = ["jason9693/Qwen2.5-1.5B-apeach"] - -@pytest.mark.parametrize("model", CLASSIFICATION_MODELS) +@pytest.mark.parametrize( + "model", + [ + pytest.param("jason9693/Qwen2.5-1.5B-apeach", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + ], +) @pytest.mark.parametrize("dtype", ["float"]) def test_classification_models( hf_runner, @@ -23,31 +27,19 @@ def test_classification_models( ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
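The decoder-only language-model tests above (and the embedding/classification tests that follow) move from a flat `MODELS` list to `pytest.param` entries with per-model marks, so CI can slice the matrix with `-m core_model` or `-m cpu_model`. A minimal, generic sketch of the pattern, assuming the marks are registered in the project's pytest configuration:

```python
import pytest


@pytest.mark.parametrize(
    "model",
    [
        # Selected by both `-m core_model` and `-m cpu_model` runs.
        pytest.param("facebook/opt-125m",
                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
        # Unmarked: only picked up by a full, unfiltered run.
        pytest.param("bigcode/starcoder2-3b"),
    ],
)
def test_models(model: str) -> None:
    assert isinstance(model, str)
```

Running `pytest -m cpu_model` then collects only the first case, while dropping the `-m` filter runs both.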
+ model_runner.model) with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSequenceClassification) as hf_model: hf_outputs = hf_model.classify(example_prompts) - print(hf_outputs, vllm_outputs) - # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): hf_output = torch.tensor(hf_output) vllm_output = torch.tensor(vllm_output) assert torch.allclose(hf_output, vllm_output, 1e-3) - - -@pytest.mark.parametrize("model", CLASSIFICATION_MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_classification_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index cd920aec6502e..c3f351ef707be 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -4,23 +4,25 @@ """ import pytest -from vllm.utils import current_platform - from ..utils import check_embeddings_close -# Model, Guard -MODELS = [ - "intfloat/e5-mistral-7b-instruct", - "BAAI/bge-base-en-v1.5", - "BAAI/bge-multilingual-gemma2", -] - -ENCODER_ONLY = [ - "BAAI/bge-base-en-v1.5", -] - -@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize( + "model", + [ + # [Encoder-only] + pytest.param("BAAI/bge-base-en-v1.5", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("intfloat/multilingual-e5-large"), + # [Encoder-decoder] + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("BAAI/bge-multilingual-gemma2", + marks=[pytest.mark.core_model]), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), + ], +) @pytest.mark.parametrize("dtype", ["half"]) def test_models( hf_runner, @@ -29,9 +31,6 @@ def test_models( model, dtype: str, ) -> None: - if model not in ENCODER_ONLY and current_platform.is_cpu(): - pytest.skip("Skip large embedding models test on CPU.") - # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" # sentence_transformers will strip the input texts, see: @@ -44,8 +43,13 @@ def test_models( is_sentence_transformer=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) - with vllm_runner(model, dtype=dtype, max_model_len=None) as vllm_model: + with vllm_runner(model, task="embedding", dtype=dtype, + max_model_len=None) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 9fab5898a06ba..329c6ba279f89 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -88,6 +88,7 @@ def _run_test( @pytest.mark.skipif(transformers.__version__.startswith("4.46"), reason="Model broken with changes in transformers 4.46") +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_text( @@ -112,6 +113,7 @@ def test_models_text( @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_image( diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index ee411472ba284..6145aff1a5ea2 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -74,6 +74,7 @@ def _run_test( ) +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_text( @@ -98,6 +99,7 @@ def test_models_text( @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_image( diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 8e8862fadbf04..10aba8427944f 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -14,8 +14,6 @@ from ....utils import multi_gpu_test from ...utils import check_logprobs_close -MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] - def vllm_to_hf_output( vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -170,7 +168,14 @@ def run_test( ) -@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize( + "model", + [ + pytest.param("facebook/bart-base", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("facebook/bart-large-cnn"), + ], +) @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index a3b1c0950d9a2..77dd1d81f84d7 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -233,6 +233,7 @@ def clear_cache(): @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "sizes", @@ -278,6 +279,7 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -326,6 +328,7 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/tests/models/registry.py 
b/tests/models/registry.py new file mode 100644 index 0000000000000..3848367b6126c --- /dev/null +++ b/tests/models/registry.py @@ -0,0 +1,216 @@ +from dataclasses import dataclass, field +from typing import AbstractSet, Mapping, Optional + + +@dataclass(frozen=True) +class _HfExamplesInfo: + default: str + """The default model to use for testing this architecture.""" + + extras: Mapping[str, str] = field(default_factory=dict) + """Extra models to use for testing this architecture.""" + + tokenizer: Optional[str] = None + """Set the tokenizer to load for this architecture.""" + + tokenizer_mode: str = "auto" + """Set the tokenizer type for this architecture.""" + + speculative_model: Optional[str] = None + """ + The default model to use for testing this architecture, which is only used + for speculative decoding. + """ + + is_available_online: bool = True + """ + Set this to ``False`` if the name of this architecture no longer exists on + the HF repo. To maintain backwards compatibility, we have not removed them + from the main model registry, so without this flag the registry tests will + fail. + """ + + trust_remote_code: bool = False + """The ``trust_remote_code`` level required to load the model.""" + + +# yapf: disable +_TEXT_GENERATION_EXAMPLE_MODELS = { + # [Decoder-only] + "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", + trust_remote_code=True), + "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", + trust_remote_code=True), + "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", + trust_remote_code=True), + "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", + trust_remote_code=True), + "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", + trust_remote_code=True), + "BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"), + # ChatGLMModel supports multimodal + "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01", + trust_remote_code=True), + "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"), + "DeciLMForCausalLM": _HfExamplesInfo("Deci/DeciLM-7B-instruct", + trust_remote_code=True), + "DeepseekForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-llm-7b-chat"), + "DeepseekV2ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V2-Lite-Chat", # noqa: E501 + trust_remote_code=True), + "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), + "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), + "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), + "GPT2LMHeadModel": _HfExamplesInfo("gpt2"), + "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"), + "GPTJForCausalLM": _HfExamplesInfo("EleutherAI/gpt-j-6b"), + "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-160m"), + "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), + "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), + "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", + trust_remote_code=True), + "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", + trust_remote_code=True), + "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B", + trust_remote_code=True), + "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), + "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"), + "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"), + "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", + 
is_available_online=False), + "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), + "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501 + "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16", + trust_remote_code=True), + "MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B", + trust_remote_code=True), + "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"), + "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1"), # noqa: E501 + "QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501 + "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False), + "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), + "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"), + "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), + "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), + "OPTForCausalLM": _HfExamplesInfo("facebook/opt-iml-max-1.3b"), + "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat", + trust_remote_code=True), + "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"), + "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"), + "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), + "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct", + trust_remote_code=True), + "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", + trust_remote_code=True), + # QWenLMHeadModel supports multimodal + "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"), + "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), + "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", + is_available_online=False), + "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 + is_available_online=False), + "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), + "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), + "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), + "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat", + is_available_online=False, + trust_remote_code=True), + # [Encoder-decoder] + "BartModel": _HfExamplesInfo("facebook/bart-base"), + "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), + # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer + # Therefore, we borrow the BartTokenizer from the original Bart model + "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501 + tokenizer="facebook/bart-base", + trust_remote_code=True), # noqa: E501 +} + +_EMBEDDING_EXAMPLE_MODELS = { + # [Text-only] + "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), + "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), + "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), + "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), + "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), + "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), + "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 + "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 + "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-large"), + # [Multimodal] + "LlavaNextForConditionalGeneration": 
_HfExamplesInfo("royokong/e5-v"), + "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", + trust_remote_code=True), + "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501 +} + +_MULTIMODAL_EXAMPLE_MODELS = { + # [Decoder-only] + "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 + "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 + "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", + extras={"text_only": "THUDM/chatglm3-6b"}, + trust_remote_code=True), + "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", + is_available_online=False), + "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), + "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), + "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", + trust_remote_code=True), + "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501 + "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", + extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501 + "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 + "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 + "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 + "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", + trust_remote_code=True), + "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", + trust_remote_code=True), + "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B", + trust_remote_code=True), + "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-pt-224"), # noqa: E501 + "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", + trust_remote_code=True), + "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 + tokenizer_mode="mistral"), + "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat", + extras={"text_only": "Qwen/Qwen-7B-Chat"}, # noqa: E501 + trust_remote_code=True), + "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 + "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 + "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), + # [Encoder-decoder] + "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 +} + +_SPECULATIVE_DECODING_EXAMPLE_MODELS = { + "EAGLEModel": _HfExamplesInfo("JackFram/llama-68m", + speculative_model="abhigoyal/vllm-eagle-llama-68m-random"), # noqa: E501 + "MedusaModel": _HfExamplesInfo("JackFram/llama-68m", + speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501 + "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", + speculative_model="ibm-fms/llama-160m-accelerator"), # noqa: E501 +} + +_EXAMPLE_MODELS = { + **_TEXT_GENERATION_EXAMPLE_MODELS, + **_EMBEDDING_EXAMPLE_MODELS, + **_MULTIMODAL_EXAMPLE_MODELS, + **_SPECULATIVE_DECODING_EXAMPLE_MODELS, +} + + +class HfExampleModels: + def __init__(self, hf_models: Mapping[str, _HfExamplesInfo]) -> None: + super().__init__() + + self.hf_models = hf_models + + def get_supported_archs(self) -> AbstractSet[str]: + return self.hf_models.keys() + + def get_hf_info(self, model_arch: str) -> _HfExamplesInfo: + return 
self.hf_models[model_arch] + + +HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py new file mode 100644 index 0000000000000..b8312c2d9b7cc --- /dev/null +++ b/tests/models/test_initialization.py @@ -0,0 +1,55 @@ +from unittest.mock import patch + +import pytest +import transformers +from transformers import PretrainedConfig + +from vllm import LLM + +from .registry import HF_EXAMPLE_MODELS + + +@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) +def test_can_initialize(model_arch): + if (model_arch == "Idefics3ForConditionalGeneration" + and transformers.__version__ < "4.46.0"): + pytest.skip(reason="Model introduced in HF >= 4.46.0") + + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + if not model_info.is_available_online: + pytest.skip("Model is not available online") + + # Avoid OOM + def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: + if hasattr(hf_config, "text_config"): + text_config: PretrainedConfig = hf_config.text_config + else: + text_config = hf_config + + text_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + "num_experts": 2, + "num_experts_per_tok": 2, + "num_local_experts": 2, + }) + + return hf_config + + # Avoid calling model.forward() + def _initialize_kv_caches(self) -> None: + self.cache_config.num_gpu_blocks = 0 + self.cache_config.num_cpu_blocks = 0 + + with patch.object(LLM.get_engine_class(), "_initialize_kv_caches", + _initialize_kv_caches): + LLM( + model_info.default, + tokenizer=model_info.tokenizer, + tokenizer_mode=model_info.tokenizer_mode, + speculative_model=model_info.speculative_model, + num_speculative_tokens=1 if model_info.speculative_model else None, + trust_remote_code=model_info.trust_remote_code, + load_format="dummy", + hf_overrides=hf_overrides, + ) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index a2194fa15f90e..e462dae3dc688 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -14,6 +14,7 @@ from vllm.platforms import current_platform from ..utils import fork_new_process_for_each_test +from .registry import HF_EXAMPLE_MODELS @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) @@ -73,3 +74,12 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda): "This model no longer initializes CUDA on import. 
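The new `tests/models/registry.py` above is consumed by `test_initialization.py` and by the coverage check added to `test_registry.py`. An illustrative consumer, assuming it is run from the repository root so that the `tests` package is importable:

```python
from tests.models.registry import HF_EXAMPLE_MODELS

for arch in sorted(HF_EXAMPLE_MODELS.get_supported_archs()):
    info = HF_EXAMPLE_MODELS.get_hf_info(arch)
    if not info.is_available_online:
        continue  # the HF repo for this architecture no longer exists
    flags = []
    if info.trust_remote_code:
        flags.append("trust_remote_code")
    if info.speculative_model:
        flags.append(f"speculative={info.speculative_model}")
    print(f"{arch}: {info.default} {' '.join(flags)}")
```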
" "Please test using a different one.", stacklevel=2) + + +def test_hf_registry_coverage(): + untested_archs = (ModelRegistry.get_supported_archs() - + HF_EXAMPLE_MODELS.get_supported_archs()) + + assert not untested_archs, ( + "Please add the following architectures to " + f"`tests/models/registry.py`: {untested_archs}") diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 03097569b2b3b..26add5bf6d90d 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -8,6 +8,7 @@ import torch from compressed_tensors.quantization import QuantizationType +from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, @@ -74,6 +75,35 @@ def zp_valid(zp: Optional[torch.Tensor]): assert output +@pytest.mark.parametrize( + "model_path", + [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8" + # TODO static & asymmetric + ]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner, + example_prompts, model_path, + max_tokens, num_logprobs): + dtype = "bfloat16" + + with hf_runner(model_path, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + + with vllm_runner(model_path, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + def test_compressed_tensors_no_enforce_eager(vllm_runner): model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" with vllm_runner(model_path) as llm: diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 32591ecfe6774..edd079bc7a389 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -8,10 +8,12 @@ import openai import pytest import torch +from huggingface_hub import snapshot_download from tensorizer import EncryptionParams from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs +# yapf conflicts with isort for this docstring # yapf: disable from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, TensorSerializer, @@ -20,13 +22,14 @@ open_stream, serialize_vllm_model, tensorize_vllm_model) +# yapf: enable +from vllm.utils import import_from_path from ..conftest import VllmRunner -from ..utils import RemoteOpenAIServer +from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip -# yapf conflicts with isort for this docstring - +EXAMPLES_PATH = VLLM_PATH / "examples" prompts = [ "Hello, my name is", @@ -94,8 +97,8 @@ def test_can_deserialize_s3(vllm_runner): num_readers=1, s3_endpoint="object.ord1.coreweave.com", )) as loaded_hf_model: - deserialized_outputs = loaded_hf_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_hf_model.generate( + prompts, sampling_params) # noqa: E501 assert deserialized_outputs @@ -111,23 +114,21 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( outputs = vllm_model.generate(prompts, sampling_params) - config_for_serializing = 
TensorizerConfig( - tensorizer_uri=model_path, - encryption_keyfile=key_path - ) + config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, + encryption_keyfile=key_path) serialize_vllm_model(get_torch_model(vllm_model), config_for_serializing) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - with vllm_runner( - model_ref, - load_format="tensorizer", - model_loader_extra_config=config_for_deserializing) as loaded_vllm_model: # noqa: E501 + with vllm_runner(model_ref, + load_format="tensorizer", + model_loader_extra_config=config_for_deserializing + ) as loaded_vllm_model: # noqa: E501 - deserialized_outputs = loaded_vllm_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_vllm_model.generate( + prompts, sampling_params) # noqa: E501 assert outputs == deserialized_outputs @@ -156,14 +157,14 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): - from huggingface_hub import snapshot_download - - from examples.multilora_inference import (create_test_prompts, - process_requests) + multilora_inference = import_from_path( + "examples.multilora_inference", + EXAMPLES_PATH / "multilora_inference.py", + ) model_ref = "meta-llama/Llama-2-7b-hf" lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - test_prompts = create_test_prompts(lora_path) + test_prompts = multilora_inference.create_test_prompts(lora_path) # Serialize model before deserializing and binding LoRA adapters with vllm_runner(model_ref, ) as vllm_model: @@ -186,7 +187,8 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): max_num_seqs=50, max_model_len=1000, ) as loaded_vllm_model: - process_requests(loaded_vllm_model.model.llm_engine, test_prompts) + multilora_inference.process_requests( + loaded_vllm_model.model.llm_engine, test_prompts) assert loaded_vllm_model @@ -217,8 +219,11 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Start OpenAI API server openai_args = [ - "--dtype", "float16", "--load-format", - "tensorizer", "--model-loader-extra-config", + "--dtype", + "float16", + "--load-format", + "tensorizer", + "--model-loader-extra-config", json.dumps(model_loader_extra_config), ] @@ -251,8 +256,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner): torch.cuda.empty_cache() -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Requires 2 GPUs") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") def test_tensorizer_with_tp_path_without_template(vllm_runner): with pytest.raises(ValueError): model_ref = "EleutherAI/pythia-1.4b" @@ -271,10 +275,9 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner): ) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Requires 2 GPUs") -def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, - tmp_path): +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") +def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( + vllm_runner, tmp_path): model_ref = "EleutherAI/pythia-1.4b" # record outputs from un-sharded un-tensorized model with vllm_runner( @@ -313,13 +316,12 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, disable_custom_all_reduce=True, enforce_eager=True, model_loader_extra_config=tensorizer_config) as loaded_vllm_model: - deserialized_outputs = loaded_vllm_model.generate(prompts, - sampling_params) + 
deserialized_outputs = loaded_vllm_model.generate( + prompts, sampling_params) assert outputs == deserialized_outputs - @retry_until_skip(3) def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): gc.collect() @@ -337,8 +339,8 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref, load_format="tensorizer", model_loader_extra_config=config) as loaded_vllm_model: - deserialized_outputs = loaded_vllm_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_vllm_model.generate( + prompts, sampling_params) # noqa: E501 assert outputs == deserialized_outputs diff --git a/tests/test_config.py b/tests/test_config.py index df382d22d83ec..3cf90297ce177 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,8 @@ +from dataclasses import asdict + import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, PoolerConfig from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform @@ -108,7 +110,7 @@ def test_get_sliding_window(): reason="Xformers backend is not supported on ROCm.") def test_get_pooling_config(): model_id = "sentence-transformers/all-MiniLM-L12-v2" - minilm_model_config = ModelConfig( + model_config = ModelConfig( model_id, task="auto", tokenizer=model_id, @@ -119,39 +121,31 @@ def test_get_pooling_config(): revision=None, ) - minilm_pooling_config = minilm_model_config._init_pooler_config( - pooling_type=None, - pooling_norm=None, - pooling_returned_token_ids=None, - pooling_softmax=None, - pooling_step_tag_id=None) + pooling_config = model_config._init_pooler_config(None) + assert pooling_config is not None - assert minilm_pooling_config.pooling_norm - assert minilm_pooling_config.pooling_type == PoolingType.MEAN.name + assert pooling_config.normalize + assert pooling_config.pooling_type == PoolingType.MEAN.name @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_get_pooling_config_from_args(): model_id = "sentence-transformers/all-MiniLM-L12-v2" - minilm_model_config = ModelConfig(model_id, - task="auto", - tokenizer=model_id, - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype="float16", - revision=None) - - minilm_pooling_config = minilm_model_config._init_pooler_config( - pooling_type='CLS', - pooling_norm=True, - pooling_returned_token_ids=None, - pooling_softmax=None, - pooling_step_tag_id=None) - - assert minilm_pooling_config.pooling_norm - assert minilm_pooling_config.pooling_type == PoolingType.CLS.name + model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None) + + override_config = PoolerConfig(pooling_type='CLS', normalize=True) + + pooling_config = model_config._init_pooler_config(override_config) + assert pooling_config is not None + assert asdict(pooling_config) == asdict(override_config) @pytest.mark.skipif(current_platform.is_rocm(), diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh index 0bb6fd2eafa14..d99fa77b96351 100755 --- a/tools/shellcheck.sh +++ b/tools/shellcheck.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e scversion="stable" @@ -18,4 +19,4 @@ if ! [ -x "$(command -v shellcheck)" ]; then fi # TODO - fix warnings in .buildkite/run-amd-test.sh -find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -exec sh -c 'git check-ignore -q $1 || shellcheck $1' _ {} \; +find . 
-name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 8f331a27a20de..b276b8fc25473 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -510,10 +510,16 @@ def cutlass_scaled_mm_azp(a: torch.Tensor, azp_adj: torch.Tensor, azp: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + :param azp_adj: In the per-tensor case, this should include the azp. + Always per-channel. + :param azp: Only set in the per-token case. Per-token if set. + """ assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) assert bias is None or bias.numel( ) == b.shape[1] and bias.dtype == out_dtype + assert azp is None or azp.numel() == a.shape[0] m = a.shape[0] n = b.shape[1] @@ -735,7 +741,7 @@ def scaled_int8_quant( azp is None), "azp must only be provided for asymmetric quantization." torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) - return output, scale, None + return output, scale, azp # dynamic-per-token quantization. input_scales = torch.empty((input.numel() // input.shape[-1], 1), diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 6b270ffd5bc00..8df6d4ced9dc6 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -10,7 +10,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 256] + return [32, 64, 80, 96, 112, 128, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 92023d5b75f5a..076f151ffcb61 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -34,7 +34,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 120, 128, 192, 256] + return [32, 64, 80, 96, 112, 120, 128, 192, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 2a0cf0002c9dd..eb43604b1399b 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -281,11 +281,11 @@ def __call__(self, graph: torch.fx.Graph): self.dump_graph(graph, "before_fusion") count = self.patterns.apply(graph) - logger.info("Replaced %s patterns", count) + logger.debug("Replaced %s patterns", count) self.dump_graph(graph, "after_pattern_match") # Manually process multi-output matches (and run DCE) self.process_matches(graph) - logger.info("Post-processed %s matches", len(self.matches)) + logger.debug("Post-processed %s matches", len(self.matches)) self.dump_graph(graph, "after_fusion") self.matches.clear() diff --git a/vllm/compilation/reshapes.py b/vllm/compilation/reshapes.py index 0d284246d2576..36597e119d2e1 100644 --- a/vllm/compilation/reshapes.py +++ b/vllm/compilation/reshapes.py @@ -53,7 +53,7 @@ def __call__(self, graph: torch.fx.Graph): graph.erase_node(node) count += 1 - logger.info("Removed %s no-op reshapes", count) + logger.debug("Removed %s no-op reshapes", count) self.dump_graph(graph, "after_reshapes") diff --git a/vllm/config.py b/vllm/config.py index f61665eb2214f..7b433b23e2a7d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,8 +3,8 @@ import json import warnings from dataclasses import dataclass, field, replace -from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal, - Mapping, 
Optional, Set, Tuple, Type, Union) +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Dict, Final, List, + Literal, Mapping, Optional, Set, Tuple, Type, Union) import torch from transformers import PretrainedConfig @@ -21,7 +21,7 @@ get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model, is_s3 from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - print_warning_once) + identity, print_warning_once) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -45,6 +45,9 @@ # "draft" is only used internally for speculative decoding _Task = Literal["generate", "embedding", "draft"] +HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], + PretrainedConfig]] + class ModelConfig: """Configuration for the model. @@ -110,29 +113,19 @@ class ModelConfig: the model name will be the same as `model`. limit_mm_per_prompt: Maximum number of data items per modality per prompt. Only applicable for multimodal models. - override_neuron_config: Initialize non default neuron config or - override default neuron config that are specific to Neuron devices, - this argument will be used to configure the neuron config that - can not be gathered from the vllm arguments. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. - hf_overrides: Arguments to be forwarded to the HuggingFace config. + hf_overrides: If a dictionary, contains arguments to be forwarded to the + HuggingFace config. If a callable, it is called to update the + HuggingFace config. mm_processor_kwargs: Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. - pooling_type: Used to configure the pooling method in the embedding - model. - pooling_norm: Used to determine whether to normalize the pooled - data in the embedding model. - pooling_softmax: Used to determine whether to softmax the pooled - data in the embedding model. - pooling_step_tag_id: When pooling_step_tag_id is not -1, it indicates - that the score corresponding to the pooling_step_tag_id in the - generated sentence should be returned. Otherwise, it returns - the scores for all tokens. - pooling_returned_token_ids: pooling_returned_token_ids represents a - list of indices for the vocabulary dimensions to be extracted, - such as the token IDs of good_token and bad_token in the - math-shepherd-mistral-7b-prm model. + override_neuron_config: Initialize non default neuron config or + override default neuron config that are specific to Neuron devices, + this argument will be used to configure the neuron config that + can not be gathered from the vllm arguments. + override_pooling_config: Initialize non default pooling config or + override default pooling config for the embedding model. 
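Per the updated docstring above, `hf_overrides` now accepts either a dict of keyword overrides or a callable that edits the loaded `PretrainedConfig`; the callable form is what `tests/models/test_initialization.py` uses to shrink models before loading. A sketch of both forms from the `LLM` entrypoint (`load_format="dummy"` is used here only to skip downloading weights):

```python
from transformers import PretrainedConfig

from vllm import LLM

# Dict form: forwarded as keyword overrides to the HF config loader.
llm_kw = LLM("facebook/opt-125m",
             hf_overrides={"num_hidden_layers": 2},
             load_format="dummy")


# Callable form: receives the loaded config and returns the (edited) config.
def shrink(hf_config: PretrainedConfig) -> PretrainedConfig:
    hf_config.update({"num_hidden_layers": 2})
    return hf_config


llm_fn = LLM("facebook/opt-125m", hf_overrides=shrink, load_format="dummy")
```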
""" def __init__( @@ -162,16 +155,11 @@ def __init__( served_model_name: Optional[Union[str, List[str]]] = None, limit_mm_per_prompt: Optional[Mapping[str, int]] = None, use_async_output_proc: bool = True, - override_neuron_config: Optional[Dict[str, Any]] = None, config_format: ConfigFormat = ConfigFormat.AUTO, - chat_template_text_format: str = "string", - hf_overrides: Optional[Dict[str, Any]] = None, + hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, - pooling_type: Optional[str] = None, - pooling_norm: Optional[bool] = None, - pooling_softmax: Optional[bool] = None, - pooling_step_tag_id: Optional[int] = None, - pooling_returned_token_ids: Optional[List[int]] = None) -> None: + override_neuron_config: Optional[Dict[str, Any]] = None, + override_pooler_config: Optional["PoolerConfig"] = None) -> None: self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode @@ -183,15 +171,23 @@ def __init__( if hf_overrides is None: hf_overrides = {} + + if callable(hf_overrides): + hf_overrides_kw = {} + hf_overrides_fn = hf_overrides + else: + hf_overrides_kw = hf_overrides + hf_overrides_fn = identity + if rope_scaling is not None: hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} - hf_overrides.update(hf_override) + hf_overrides_kw.update(hf_override) msg = ("`--rope-scaling` will be removed in a future release. " f"'Please instead use `--hf-overrides '{hf_override!r}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) if rope_theta is not None: hf_override = {"rope_theta": rope_theta} - hf_overrides.update(hf_override) + hf_overrides_kw.update(hf_override) msg = ("`--rope-theta` will be removed in a future release. " f"'Please instead use `--hf-overrides '{hf_override!r}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) @@ -220,16 +216,18 @@ def __init__( self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision, config_format, - **hf_overrides) + + hf_config = get_config(self.model, trust_remote_code, revision, + code_revision, config_format, **hf_overrides_kw) + hf_config = hf_overrides_fn(hf_config) + self.hf_config = hf_config + self.hf_text_config = get_hf_text_config(self.hf_config) self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, revision) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) self.use_async_output_proc = use_async_output_proc - self.chat_template_text_format = chat_template_text_format self.mm_processor_kwargs = mm_processor_kwargs # Set enforce_eager to False if the value is unset. 
@@ -277,13 +275,7 @@ def __init__( supported_tasks, task = self._resolve_task(task, self.hf_config) self.supported_tasks = supported_tasks self.task: Final = task - self.pooler_config = self._init_pooler_config( - pooling_type, - pooling_norm, - pooling_softmax, - pooling_step_tag_id, - pooling_returned_token_ids, - ) + self.pooler_config = self._init_pooler_config(override_pooler_config) self._verify_quantization() self._verify_cuda_graph() @@ -308,27 +300,21 @@ def _get_encoder_config(self): def _init_pooler_config( self, - pooling_type: Optional[str] = None, - pooling_norm: Optional[bool] = None, - pooling_softmax: Optional[bool] = None, - pooling_step_tag_id: Optional[int] = None, - pooling_returned_token_ids: Optional[List[int]] = None + override_pooler_config: Optional["PoolerConfig"], ) -> Optional["PoolerConfig"]: + if self.task == "embedding": - pooling_config = get_pooling_config(self.model, self.revision) - if pooling_config is not None: - # override if user does not - # specifies pooling_type and/or pooling_norm - if pooling_type is None: - pooling_type = pooling_config["pooling_type"] - if pooling_norm is None: - pooling_norm = pooling_config["normalize"] - return PoolerConfig( - pooling_type=pooling_type, - pooling_norm=pooling_norm, - pooling_softmax=pooling_softmax, - pooling_step_tag_id=pooling_step_tag_id, - pooling_returned_token_ids=pooling_returned_token_ids) + user_config = override_pooler_config or PoolerConfig() + + base_config = get_pooling_config(self.model, self.revision) + if base_config is not None: + # Only set values that are not overridden by the user + for k, v in base_config.items(): + if getattr(user_config, k) is None: + setattr(user_config, k, v) + + return user_config + return None def _init_attention_free(self) -> bool: @@ -1784,13 +1770,43 @@ class MultiModalConfig: @dataclass class PoolerConfig: - """Controls the behavior of pooler in embedding model""" + """Controls the behavior of output pooling in embedding models.""" pooling_type: Optional[str] = None - pooling_norm: Optional[bool] = None - pooling_softmax: Optional[bool] = None - pooling_step_tag_id: Optional[int] = None - pooling_returned_token_ids: Optional[List[int]] = None + """ + The pooling method of the embedding model. This should be a key in + :class:`vllm.model_executor.layers.pooler.PoolingType`. + """ + + normalize: Optional[bool] = None + """ + Whether to normalize the pooled outputs. Usually, this should be set to + ``True`` for embedding outputs. + """ + + softmax: Optional[bool] = None + """ + Whether to apply softmax to the pooled outputs. Usually, this should be set + to ``True`` for classification outputs. + """ + + step_tag_id: Optional[int] = None + """ + If set, only the score corresponding to the ``step_tag_id`` in the + generated sentence should be returned. Otherwise, the scores for all tokens + are returned. + """ + + returned_token_ids: Optional[List[int]] = None + """ + A list of indices for the vocabulary dimensions to be extracted, + such as the token IDs of ``good_token`` and ``bad_token`` in the + ``math-shepherd-mistral-7b-prm`` model. 
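With this change, the individual pooling_* knobs collapse into a single override: any PoolerConfig field left as None falls back to the pooling configuration shipped with the model (resolved via get_pooling_config), and explicitly set fields take precedence. A minimal sketch of the offline API (model name is illustrative):

    from vllm import LLM
    from vllm.config import PoolerConfig

    # Only the fields set here override the model's own pooling config;
    # everything else (e.g. softmax, step_tag_id) is resolved from defaults.
    llm = LLM(model="BAAI/bge-base-en-v1.5",
              task="embedding",
              override_pooler_config=PoolerConfig(pooling_type="CLS",
                                                  normalize=True))
    outputs = llm.encode("The quick brown fox")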
+ """ + + @staticmethod + def from_json(json_str: str) -> "PoolerConfig": + return PoolerConfig(**json.loads(json_str)) _STR_DTYPE_TO_TORCH_DTYPE = { diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cb178dafdecb0..ec908be1c4471 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -9,14 +9,13 @@ import vllm.envs as envs from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig, - DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, TaskOption, TokenizerPoolConfig, - VllmConfig) + DeviceConfig, HfOverrides, LoadConfig, LoadFormat, + LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, PoolerConfig, PromptAdapterConfig, + SchedulerConfig, SpeculativeConfig, TaskOption, + TokenizerPoolConfig, VllmConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger -from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file @@ -91,7 +90,6 @@ class EngineArgs: task: TaskOption = "auto" skip_tokenizer_init: bool = False tokenizer_mode: str = 'auto' - chat_template_text_format: str = 'string' trust_remote_code: bool = False allowed_local_media_path: str = "" download_dir: Optional[str] = None @@ -128,7 +126,7 @@ class EngineArgs: code_revision: Optional[str] = None rope_scaling: Optional[Dict[str, Any]] = None rope_theta: Optional[float] = None - hf_overrides: Optional[Dict[str, Any]] = None + hf_overrides: Optional[HfOverrides] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None enforce_eager: Optional[bool] = None @@ -187,15 +185,10 @@ class EngineArgs: otlp_traces_endpoint: Optional[str] = None collect_detailed_traces: Optional[str] = None disable_async_output_proc: bool = False - override_neuron_config: Optional[Dict[str, Any]] = None scheduling_policy: Literal["fcfs", "priority"] = "fcfs" - # Pooling configuration. - pooling_type: Optional[str] = None - pooling_norm: Optional[bool] = None - pooling_softmax: Optional[bool] = None - pooling_step_tag_id: Optional[int] = None - pooling_returned_token_ids: Optional[List[int]] = None + override_neuron_config: Optional[Dict[str, Any]] = None + override_pooler_config: Optional[PoolerConfig] = None def __post_init__(self): if not self.tokenizer: @@ -264,24 +257,16 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'fast tokenizer if available.\n* "slow" will ' 'always use the slow tokenizer. \n* ' '"mistral" will always use the `mistral_common` tokenizer.') - parser.add_argument( - '--chat-template-text-format', - type=str, - default=EngineArgs.chat_template_text_format, - choices=['string', 'openai'], - help='The format to render text content within a chat template. ' - '"string" will keep the content field as a string whereas ' - '"openai" will parse content in the current OpenAI format.') parser.add_argument('--trust-remote-code', action='store_true', help='Trust remote code from huggingface.') parser.add_argument( '--allowed-local-media-path', type=str, - help="Allowing API requests to read local images or videos" - "from directories specified by the server file system." - "This is a security risk." 
- "Should only be enabled in trusted environments") + help="Allowing API requests to read local images or videos " + "from directories specified by the server file system. " + "This is a security risk. " + "Should only be enabled in trusted environments.") parser.add_argument('--download-dir', type=nullable_str, default=EngineArgs.download_dir, @@ -348,7 +333,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'scaling factors. This should generally be supplied, when ' 'KV cache dtype is FP8. Otherwise, KV cache scaling factors ' 'default to 1.0, which may cause accuracy issues. ' - 'FP8_E5M2 (without scaling) is only supported on cuda version' + 'FP8_E5M2 (without scaling) is only supported on cuda version ' 'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead ' 'supported for common inference criteria.') parser.add_argument('--max-model-len', @@ -454,9 +439,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'this argument can be seen as a virtual way to increase ' 'the GPU memory size. For example, if you have one 24 GB ' 'GPU and set this to 10, virtually you can think of it as ' - 'a 34 GB GPU. Then you can load a 13B model with BF16 weight,' + 'a 34 GB GPU. Then you can load a 13B model with BF16 weight, ' 'which requires at least 26GB GPU memory. Note that this ' - 'requires fast CPU-GPU interconnect, as part of the model is' + 'requires fast CPU-GPU interconnect, as part of the model is ' 'loaded from CPU memory to GPU memory on the fly in each ' 'model forward pass.') parser.add_argument( @@ -476,7 +461,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=None, help='If specified, ignore GPU profiling result and use this number' - 'of GPU blocks. Used for testing preemption.') + ' of GPU blocks. Used for testing preemption.') parser.add_argument('--max-num-batched-tokens', type=int, default=EngineArgs.max_num_batched_tokens, @@ -522,7 +507,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--hf-overrides', type=json.loads, default=EngineArgs.hf_overrides, - help='Extra arguments for the HuggingFace config.' + help='Extra arguments for the HuggingFace config. ' 'This should be a JSON string that will be ' 'parsed into a dictionary.') parser.add_argument('--enforce-eager', @@ -580,7 +565,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--mm-processor-kwargs', default=None, type=json.loads, - help=('Overrides for the multimodal input mapping/processing,' + help=('Overrides for the multimodal input mapping/processing, ' 'e.g., image processor. For example: {"num_crops": 4}.')) # LoRA related configs @@ -609,7 +594,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--lora-dtype', type=str, default=EngineArgs.lora_dtype, - choices=['auto', 'float16', 'bfloat16', 'float32'], + choices=['auto', 'float16', 'bfloat16'], help=('Data type for LoRA. If auto, will default to ' 'base model dtype.')) parser.add_argument( @@ -830,9 +815,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "of the provided names. The model name in the model " "field of a response will be the first name in this " "list. If not specified, the model name will be the " - "same as the `--model` argument. Noted that this name(s)" + "same as the `--model` argument. 
Noted that this name(s) " "will also be used in `model_name` tag content of " - "prometheus metrics, if multiple names provided, metrics" + "prometheus metrics, if multiple names provided, metrics " "tag will take the first one.") parser.add_argument('--qlora-adapter-name-or-path', type=str, @@ -861,12 +846,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.disable_async_output_proc, help="Disable async output processing. This may result in " "lower performance.") - parser.add_argument( - '--override-neuron-config', - type=json.loads, - default=None, - help="Override or set neuron device configuration. " - "e.g. {\"cast_logits_dtype\": \"bloat16\"}.'") parser.add_argument( '--scheduling-policy', @@ -879,56 +858,17 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'arrival deciding any ties).') parser.add_argument( - '--pooling-type', - choices=[pt.name for pt in PoolingType], - default=None, - help='Used to configure the pooling method in the embedding model.' - ) - - parser.add_argument('--pooling-norm', - default=None, - action='store_true', - help="Used to determine whether to normalize " - "the pooled data in the embedding model.") - - parser.add_argument('--no-pooling-norm', - default=None, - action='store_false', - dest='pooling_norm', - help="Used to determine whether to normalize " - "the pooled data in the embedding model.") - - parser.add_argument('--pooling-softmax', - default=None, - action='store_true', - help="Used to determine whether to softmax " - "the pooled data in the embedding model.") - - parser.add_argument('--no-pooling-softmax', - default=None, - action='store_false', - dest='pooling_softmax', - help="Used to determine whether to softmax " - "the pooled data in the embedding model.") - - parser.add_argument( - '--pooling-step-tag-id', - type=int, + '--override-neuron-config', + type=json.loads, default=None, - help="When pooling-step-tag-id is not -1, it indicates " - "that the score corresponding to the step-tag-ids in the " - "generated sentence should be returned. Otherwise, it " - "returns the scores for all tokens.") - + help="Override or set neuron device configuration. " + "e.g. {\"cast_logits_dtype\": \"bloat16\"}.'") parser.add_argument( - '--pooling-returned-token-ids', - nargs='+', - type=int, + '--override-pooler-config', + type=PoolerConfig.from_json, default=None, - help="pooling-returned-token-ids represents a list of " - "indices for the vocabulary dimensions to be extracted, " - "such as the token IDs of good_token and bad_token in " - "the math-shepherd-mistral-7b-prm model.") + help="Override or set the pooling method in the embedding model. " + "e.g. 
{\"pooling_type\": \"mean\", \"normalize\": false}.'") return parser @@ -947,7 +887,6 @@ def create_model_config(self) -> ModelConfig: # We know this is not None because we set it in __post_init__ tokenizer=cast(str, self.tokenizer), tokenizer_mode=self.tokenizer_mode, - chat_template_text_format=self.chat_template_text_format, trust_remote_code=self.trust_remote_code, allowed_local_media_path=self.allowed_local_media_path, dtype=self.dtype, @@ -969,14 +908,10 @@ def create_model_config(self) -> ModelConfig: served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, use_async_output_proc=not self.disable_async_output_proc, - override_neuron_config=self.override_neuron_config, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, - pooling_type=self.pooling_type, - pooling_norm=self.pooling_norm, - pooling_softmax=self.pooling_softmax, - pooling_step_tag_id=self.pooling_step_tag_id, - pooling_returned_token_ids=self.pooling_returned_token_ids, + override_neuron_config=self.override_neuron_config, + override_pooler_config=self.override_pooler_config, ) def create_load_config(self) -> LoadConfig: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f5299746d845d..9a2d73a020c8f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -262,8 +262,7 @@ def __init__( "num_scheduler_steps=%d, chunked_prefill_enabled=%s " "multi_step_stream_outputs=%s, enable_prefix_caching=%s, " "use_async_output_proc=%s, use_cached_outputs=%s, " - "chat_template_text_format=%s, mm_processor_kwargs=%s, " - "pooler_config=%r)", + "mm_processor_kwargs=%s, pooler_config=%r)", VLLM_VERSION, model_config.model, speculative_config, @@ -296,7 +295,6 @@ def __init__( cache_config.enable_prefix_caching, model_config.use_async_output_proc, use_cached_outputs, - model_config.chat_template_text_format, model_config.mm_processor_kwargs, model_config.pooler_config, ) @@ -2002,9 +2000,6 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE, metrics.model_execute_time) - def is_encoder_decoder_model(self): - return self.input_preprocessor.is_encoder_decoder_model() - def _validate_model_inputs(self, inputs: ProcessorInputs, lora_request: Optional[LoRARequest]): if is_encoder_decoder_inputs(inputs): diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3ca460c47c3bd..abee5ac46391c 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -2,12 +2,14 @@ import codecs import json from abc import ABC, abstractmethod -from collections import defaultdict +from collections import defaultdict, deque from functools import lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, Literal, Mapping, Optional, Tuple, TypeVar, Union, cast) +import jinja2.nodes +import transformers.utils.chat_template_utils as hf_chat_utils # yapf conflicts with isort for this block # yapf: disable from openai.types.chat import (ChatCompletionAssistantMessageParam, @@ -153,6 +155,199 @@ class ConversationMessage(TypedDict, total=False): """The tool calls generated by the model, such as function calls.""" +# Passed in by user +ChatTemplateContentFormatOption = Literal["auto", "string", "openai"] + +# Used internally +_ChatTemplateContentFormat = Literal["string", "openai"] + + +def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool: + if isinstance(node, jinja2.nodes.Name): + return 
node.ctx == "load" and node.name == varname + + return False + + +def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool: + if isinstance(node, jinja2.nodes.Getitem): + return (_is_var_access(node.node, varname) + and isinstance(node.arg, jinja2.nodes.Const) + and node.arg.value == key) + + if isinstance(node, jinja2.nodes.Getattr): + return _is_var_access(node.node, varname) and node.attr == key + + return False + + +def _is_var_or_elems_access( + node: jinja2.nodes.Node, + varname: str, + key: Optional[str] = None, +) -> bool: + if isinstance(node, jinja2.nodes.Filter): + return (node.node is not None + and _is_var_or_elems_access(node.node, varname, key)) + if isinstance(node, jinja2.nodes.Test): + return _is_var_or_elems_access(node.node, varname, key) + + if (isinstance(node, jinja2.nodes.Getitem) + and isinstance(node.arg, jinja2.nodes.Slice)): + return _is_var_or_elems_access(node.node, varname, key) + + # yapf: disable + return ( + _is_attr_access(node, varname, key) if key + else _is_var_access(node, varname) + ) # yapf: enable + + +def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str): + # Global variable that is implicitly defined at the root + yield root, varname + + # Iterative BFS + related_varnames = deque([varname]) + while related_varnames: + related_varname = related_varnames.popleft() + + for assign_ast in root.find_all(jinja2.nodes.Assign): + lhs = assign_ast.target + rhs = assign_ast.node + + if _is_var_or_elems_access(rhs, related_varname): + assert isinstance(lhs, jinja2.nodes.Name) + yield assign_ast, lhs.name + + # Avoid infinite looping for self-assignment + if lhs.name != related_varname: + related_varnames.append(lhs.name) + + +# NOTE: The proper way to handle this is to build a CFG so that we can handle +# the scope in which each variable is defined, but that is too complicated +def _iter_nodes_assign_messages_item(root: jinja2.nodes.Node): + messages_varnames = [ + varname + for _, varname in _iter_nodes_assign_var_or_elems(root, "messages") + ] + + # Search for {%- for message in messages -%} loops + for loop_ast in root.find_all(jinja2.nodes.For): + loop_iter = loop_ast.iter + loop_target = loop_ast.target + + for varname in messages_varnames: + if _is_var_or_elems_access(loop_iter, varname): + assert isinstance(loop_target, jinja2.nodes.Name) + yield loop_ast, loop_target.name + break + + +def _iter_nodes_assign_content_item(root: jinja2.nodes.Node): + message_varnames = [ + varname for _, varname in _iter_nodes_assign_messages_item(root) + ] + + # Search for {%- for content in message['content'] -%} loops + for loop_ast in root.find_all(jinja2.nodes.For): + loop_iter = loop_ast.iter + loop_target = loop_ast.target + + for varname in message_varnames: + if _is_var_or_elems_access(loop_iter, varname, "content"): + assert isinstance(loop_target, jinja2.nodes.Name) + yield loop_ast, loop_target.name + break + + +def _try_extract_ast(chat_template: str) -> Optional[jinja2.nodes.Template]: + try: + jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template) + return jinja_compiled.environment.parse(chat_template) + except Exception: + logger.exception("Error when compiling Jinja template") + return None + + +def _detect_content_format( + chat_template: str, + *, + default: _ChatTemplateContentFormat, +) -> _ChatTemplateContentFormat: + jinja_ast = _try_extract_ast(chat_template) + if jinja_ast is None: + return default + + try: + next(_iter_nodes_assign_content_item(jinja_ast)) + except StopIteration: + return 
"string" + except Exception: + logger.exception("Error when parsing AST of Jinja template") + return default + else: + return "openai" + + +def _resolve_chat_template_content_format( + chat_template: Optional[str], + given_format: ChatTemplateContentFormatOption, + tokenizer: AnyTokenizer, +) -> _ChatTemplateContentFormat: + if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): + tokenizer_chat_template = tokenizer.chat_template + else: + tokenizer_chat_template = None + + jinja_text: Optional[str] + if isinstance(tokenizer_chat_template, str) and chat_template is None: + jinja_text = tokenizer_chat_template + elif (isinstance(tokenizer_chat_template, dict) + and chat_template in tokenizer_chat_template): + jinja_text = tokenizer_chat_template[chat_template] + else: + jinja_text = load_chat_template(chat_template, is_literal=True) + + detected_format = ("string" if jinja_text is None else + _detect_content_format(jinja_text, default="string")) + + return detected_format if given_format == "auto" else given_format + + +@lru_cache +def resolve_chat_template_content_format( + chat_template: Optional[str], + given_format: ChatTemplateContentFormatOption, + tokenizer: AnyTokenizer, +) -> _ChatTemplateContentFormat: + detected_format = _resolve_chat_template_content_format( + chat_template, + given_format, + tokenizer, + ) + + logger.info( + "Detected the chat template content format to be '%s'. " + "You can set `--chat-template-content-format` to override this.", + detected_format, + ) + + if given_format != "auto" and given_format != detected_format: + logger.warning( + "You specified `--chat-template-content-format %s` " + "which is different from the detected format '%s'. " + "If our automatic detection is incorrect, please consider " + "opening a GitHub issue so that we can improve it: " + "https://github.com/vllm-project/vllm/issues/new/choose", + given_format, + detected_format, + ) + + return detected_format + + ModalityStr = Literal["image", "audio", "video"] _T = TypeVar("_T") @@ -407,12 +602,23 @@ def validate_chat_template(chat_template: Optional[Union[Path, str]]): def load_chat_template( - chat_template: Optional[Union[Path, str]]) -> Optional[str]: + chat_template: Optional[Union[Path, str]], + *, + is_literal: bool = False, +) -> Optional[str]: if chat_template is None: return None + + if is_literal: + if isinstance(chat_template, Path): + raise TypeError("chat_template is expected to be read directly " + "from its value") + + return codecs.decode(chat_template, "unicode_escape") + try: with open(chat_template) as f: - resolved_chat_template = f.read() + return f.read() except OSError as e: if isinstance(chat_template, Path): raise @@ -426,10 +632,7 @@ def load_chat_template( # If opening a file fails, set chat template to be args to # ensure we decode so our escape are interpreted correctly - resolved_chat_template = codecs.decode(chat_template, "unicode_escape") - - logger.info("Using supplied chat template:\n%s", resolved_chat_template) - return resolved_chat_template + return load_chat_template(chat_template, is_literal=True) # TODO: Let user specify how to insert multimodal tokens into prompt @@ -464,7 +667,6 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _AudioParser = partial(cast, ChatCompletionContentPartAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) -MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} # Define a mapping from part 
types to their corresponding parsing functions. MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = { @@ -542,18 +744,12 @@ def _parse_chat_message_content_parts( role: str, parts: Iterable[ChatCompletionContentPartParam], mm_tracker: BaseMultiModalItemTracker, - chat_template_text_format: str, + *, + wrap_dicts: bool, ) -> List[ConversationMessage]: content: List[Union[str, Dict[str, str]]] = [] mm_parser = mm_tracker.create_parser() - model_config = mm_tracker.model_config - - wrap_dicts = (chat_template_text_format == "openai" - or (model_config.task == "embedding" - and model_config.is_multimodal_model) - or (model_config.hf_config.model_type - in MODEL_KEEP_MULTI_MODAL_CONTENT)) for part in parts: parse_res = _parse_chat_message_content_part( @@ -578,9 +774,11 @@ def _parse_chat_message_content_parts( def _parse_chat_message_content_part( - part: ChatCompletionContentPartParam, - mm_parser: BaseMultiModalContentParser, - wrap_dicts: bool) -> Optional[Union[str, Dict[str, str]]]: + part: ChatCompletionContentPartParam, + mm_parser: BaseMultiModalContentParser, + *, + wrap_dicts: bool, +) -> Optional[Union[str, Dict[str, str]]]: """Parses a single part of a conversation. If wrap_dicts is True, structured dictionary pieces for texts and images will be wrapped in dictionaries, i.e., {"type": "text", "text", ...} and @@ -629,7 +827,7 @@ def _parse_chat_message_content_part( def _parse_chat_message_content( message: ChatCompletionMessageParam, mm_tracker: BaseMultiModalItemTracker, - chat_template_text_format: str, + content_format: _ChatTemplateContentFormat, ) -> List[ConversationMessage]: role = message["role"] content = message.get("content") @@ -645,7 +843,7 @@ def _parse_chat_message_content( role, content, # type: ignore mm_tracker, - chat_template_text_format, + wrap_dicts=(content_format == "openai"), ) for result_msg in result: @@ -684,6 +882,7 @@ def parse_chat_messages( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, + content_format: _ChatTemplateContentFormat, ) -> Tuple[List[ConversationMessage], Optional[MultiModalDataDict]]: conversation: List[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -692,7 +891,7 @@ def parse_chat_messages( sub_messages = _parse_chat_message_content( msg, mm_tracker, - model_config.chat_template_text_format, + content_format, ) conversation.extend(sub_messages) @@ -706,6 +905,7 @@ def parse_chat_messages_futures( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, + content_format: _ChatTemplateContentFormat, ) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: conversation: List[ConversationMessage] = [] mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) @@ -714,7 +914,7 @@ def parse_chat_messages_futures( sub_messages = _parse_chat_message_content( msg, mm_tracker, - model_config.chat_template_text_format, + content_format, ) conversation.extend(sub_messages) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index a15dbd1c45119..86b0b6893f1d9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -9,12 +9,15 @@ from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) -from vllm.engine.arg_utils import EngineArgs, TaskOption +from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig, + TaskOption) from vllm.engine.llm_engine import 
LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ChatTemplateContentFormatOption, apply_hf_chat_template, apply_mistral_chat_template, - parse_chat_messages) + parse_chat_messages, + resolve_chat_template_content_format) from vllm.inputs import PromptType, TextPrompt, TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger @@ -101,7 +104,9 @@ class LLM: disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig` disable_async_output_proc: Disable async output processing. This may result in lower performance. - hf_overrides: Arguments to be forwarded to the HuggingFace config. + hf_overrides: If a dictionary, contains arguments to be forwarded to the + HuggingFace config. If a callable, it is called to update the + HuggingFace config. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See :ref:`engine_args`) @@ -156,15 +161,11 @@ def __init__( max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, - hf_overrides: Optional[dict] = None, + hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", - pooling_type: Optional[str] = None, - pooling_norm: Optional[bool] = None, - pooling_softmax: Optional[bool] = None, - pooling_step_tag_id: Optional[int] = None, - pooling_returned_token_ids: Optional[List[int]] = None, + override_pooler_config: Optional[PoolerConfig] = None, **kwargs, ) -> None: ''' @@ -200,11 +201,7 @@ def __init__( disable_async_output_proc=disable_async_output_proc, hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, - pooling_type=pooling_type, - pooling_norm=pooling_norm, - pooling_softmax=pooling_softmax, - pooling_step_tag_id=pooling_step_tag_id, - pooling_returned_token_ids=pooling_returned_token_ids, + override_pooler_config=override_pooler_config, **kwargs, ) # Logic to switch between engines is done at runtime instead of import @@ -528,6 +525,7 @@ def chat( use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, chat_template: Optional[str] = None, + chat_template_content_format: ChatTemplateContentFormatOption = "auto", add_generation_prompt: bool = True, continue_final_message: bool = False, tools: Optional[List[Dict[str, Any]]] = None, @@ -544,9 +542,11 @@ def chat( to the OpenAI API. Args: - messages: A list of conversations or a single conversation. - - Each conversation is represented as a list of messages. - - Each message is a dictionary with 'role' and 'content' keys. + messages: A list of conversations or a single conversation. + + - Each conversation is represented as a list of messages. + - Each message is a dictionary with 'role' and 'content' keys. + sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it @@ -556,11 +556,19 @@ def chat( lora_request: LoRA request to use for generation, if any. chat_template: The template to use for structuring the chat. If not provided, the model's default chat template will be used. + chat_template_content_format: The format to render message content. + + - "string" will render the content as a string. + Example: ``"Who are you?"`` + - "openai" will render the content as a list of dictionaries, + similar to OpenAI schema. 
+ Example: ``[{"type": "text", "text": "Who are you?"}]`` + add_generation_prompt: If True, adds a generation template to each message. continue_final_message: If True, continues the final message in - the conversation instead of starting a new one. Cannot be `True` - if `add_generation_prompt` is also `True`. + the conversation instead of starting a new one. Cannot be + ``True`` if ``add_generation_prompt`` is also ``True``. mm_processor_kwargs: Multimodal processor kwarg overrides for this chat request. Only used for offline requests. @@ -581,17 +589,26 @@ def chat( cast(List[ChatCompletionMessageParam], messages) ] + tokenizer = self.get_tokenizer() + model_config = self.llm_engine.get_model_config() + resolved_content_format = resolve_chat_template_content_format( + chat_template, + chat_template_content_format, + tokenizer, + ) + prompts: List[Union[TokensPrompt, TextPrompt]] = [] for msgs in list_of_messages: - tokenizer = self.get_tokenizer() - model_config = self.llm_engine.get_model_config() - # NOTE: _parse_chat_message_content_parts() currently doesn't # handle mm_processor_kwargs, since there is no implementation in # the chat message parsing for it. conversation, mm_data = parse_chat_messages( - msgs, model_config, tokenizer) + msgs, + model_config, + tokenizer, + content_format=resolved_content_format, + ) prompt_data: Union[str, List[int]] if isinstance(tokenizer, MistralTokenizer): @@ -742,7 +759,7 @@ def encode( generation, if any. Returns: - A list of `EmbeddingRequestOutput` objects containing the + A list of ``EmbeddingRequestOutput`` objects containing the generated embeddings in the same order as the input prompts. Note: @@ -969,6 +986,3 @@ def _run_engine( # This is necessary because some requests may be finished earlier than # its previous requests. 
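The two content formats documented above only change the shape in which message content reaches the chat template; with the default "auto", the Jinja AST inspection in chat_utils chooses between them, and the caller can force either one. A short sketch (model name is illustrative):

    from vllm import LLM

    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")

    # "string" rendering: the template receives the content as a plain string.
    string_messages = [{"role": "user", "content": "Who are you?"}]

    # "openai" rendering: the template receives a list of typed parts.
    openai_messages = [{
        "role": "user",
        "content": [{"type": "text", "text": "Who are you?"}],
    }]

    # "auto" (the default) picks a format by inspecting the chat template;
    # it can also be set explicitly per call:
    outputs = llm.chat(openai_messages,
                       chat_template_content_format="openai")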
return sorted(outputs, key=lambda x: int(x.request_id)) - - def _is_encoder_decoder_model(self): - return self.llm_engine.is_encoder_decoder_model() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6a24cdbc6a18f..b0fe061f5db4a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -12,7 +12,7 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncIterator, Optional, Set +from typing import AsyncIterator, Optional, Set, Tuple import uvloop from fastapi import APIRouter, FastAPI, Request @@ -29,6 +29,7 @@ from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import (make_arg_parser, @@ -57,7 +58,8 @@ from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path +from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, + is_valid_ipv6_address) from vllm.version import __version__ as VLLM_VERSION if envs.VLLM_USE_V1: @@ -528,6 +530,9 @@ def init_app_state( state.engine_client = engine_client state.log_stats = not args.disable_log_stats + resolved_chat_template = load_chat_template(args.chat_template) + logger.info("Using supplied chat template:\n%s", resolved_chat_template) + state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, @@ -536,7 +541,8 @@ def init_app_state( lora_modules=args.lora_modules, prompt_adapters=args.prompt_adapters, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, @@ -556,7 +562,8 @@ def init_app_state( model_config, base_model_paths, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, ) if model_config.task == "embedding" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, @@ -564,10 +571,23 @@ def init_app_state( base_model_paths, lora_modules=args.lora_modules, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, ) +def create_server_socket(addr: Tuple[str, int]) -> socket.socket: + family = socket.AF_INET + if is_valid_ipv6_address(addr[0]): + family = socket.AF_INET6 + + sock = socket.socket(family=family, type=socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(addr) + + return sock + + async def run_server(args, **uvicorn_kwargs) -> None: logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) @@ -584,9 +604,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. 
# see https://github.com/vllm-project/vllm/issues/8204 - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind((args.host or "", args.port)) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock_addr = (args.host or "", args.port) + sock = create_server_socket(sock_addr) def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index eb08a89293370..24c206a1261f2 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -7,10 +7,11 @@ import argparse import json import ssl -from typing import List, Optional, Sequence, Union +from typing import List, Optional, Sequence, Union, get_args from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str -from vllm.entrypoints.chat_utils import validate_chat_template +from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, + validate_chat_template) from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -132,6 +133,18 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="The file path to the chat template, " "or the template in single-line form " "for the specified model") + parser.add_argument( + '--chat-template-content-format', + type=str, + default="auto", + choices=get_args(ChatTemplateContentFormatOption), + help='The format to render message content within a chat template.' + '\n\n' + '* "string" will render the content as a string. ' + 'Example: "Hello World"\n' + '* "openai" will render the content as a list of dictionaries, ' + 'similar to OpenAI schema. ' + 'Example: [{"type": "text", "text": "Hello world!"}]') parser.add_argument("--response-role", type=nullable_str, default="assistant", diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 820aefd8800d9..b7b064ae01f05 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -5,9 +5,8 @@ from typing import Any, Dict, List, Literal, Optional, Union import torch -from openai.types.chat import ChatCompletionContentPartParam from pydantic import BaseModel, ConfigDict, Field, model_validator -from typing_extensions import Annotated, Required, TypedDict +from typing_extensions import Annotated from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.pooling_params import PoolingParams @@ -35,26 +34,6 @@ assert _LONG_INFO.max == _MOCK_LONG_INFO.max -class CustomChatCompletionMessageParam(TypedDict, total=False): - """Enables custom roles in the Chat Completion API.""" - role: Required[str] - """The role of the message's author.""" - - content: Union[str, List[ChatCompletionContentPartParam]] - """The contents of the message.""" - - name: str - """An optional name for the participant. - - Provides the model information to differentiate between participants of the - same role. - """ - - tool_call_id: Optional[str] - - tool_calls: Optional[List[dict]] - - class OpenAIBaseModel(BaseModel): # OpenAI API does not allow extra fields model_config = ConfigDict(extra="forbid") @@ -1054,16 +1033,56 @@ class TokenizeCompletionRequest(OpenAIBaseModel): model: str prompt: str - add_special_tokens: bool = Field(default=True) + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. 
BOS) will be added to " + "the prompt."), + ) class TokenizeChatRequest(OpenAIBaseModel): model: str messages: List[ChatCompletionMessageParam] - add_generation_prompt: bool = Field(default=True) - continue_final_message: bool = Field(default=False) - add_special_tokens: bool = Field(default=False) + add_generation_prompt: bool = Field( + default=True, + description= + ("If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model."), + ) + continue_final_message: bool = Field( + default=False, + description= + ("If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + "This allows you to \"prefill\" part of the model's response for it. " + "Cannot be used at the same time as `add_generation_prompt`."), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)."), + ) + chat_template: Optional[str] = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one."), + ) + chat_template_kwargs: Optional[Dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to the template renderer. " + "Will be accessible by the chat template."), + ) @model_validator(mode="before") @classmethod diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 1b422a93263b2..00cdb3b6839f5 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -222,6 +222,7 @@ async def main(args): prompt_adapters=None, request_logger=request_logger, chat_template=None, + chat_template_content_format="auto", enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.task == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( @@ -230,6 +231,7 @@ async def main(args): base_model_paths, request_logger=request_logger, chat_template=None, + chat_template_content_format="auto", ) if model_config.task == "embedding" else None tracker = BatchProgressTracker() diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 5178481c737b4..2eef909eb9319 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -10,7 +10,8 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import ConversationMessage, load_chat_template +from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, + ConversationMessage) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -30,6 +31,7 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.transformers_utils.tokenizers import maybe_serialize_tool_calls from vllm.utils import iterate_with_cancellation 
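The expanded TokenizeChatRequest means the /tokenize endpoint can now take a per-request chat template and extra kwargs for the renderer. A sketch of such a request against a locally running OpenAI-compatible server (host, model name, and the example kwarg are illustrative):

    import requests

    payload = {
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "messages": [{"role": "user", "content": "Hello!"}],
        "add_generation_prompt": True,
        "add_special_tokens": False,
        # Forwarded to the chat template renderer; the key is hypothetical.
        "chat_template_kwargs": {"custom_flag": True},
    }
    response = requests.post("http://localhost:8000/tokenize", json=payload)
    print(response.json())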
logger = init_logger(__name__) @@ -37,20 +39,23 @@ class OpenAIServingChat(OpenAIServing): - def __init__(self, - engine_client: EngineClient, - model_config: ModelConfig, - base_model_paths: List[BaseModelPath], - response_role: str, - *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], - request_logger: Optional[RequestLogger], - chat_template: Optional[str], - return_tokens_as_token_ids: bool = False, - enable_auto_tools: bool = False, - tool_parser: Optional[str] = None, - enable_prompt_tokens_details: bool = False): + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + base_model_paths: List[BaseModelPath], + response_role: str, + *, + lora_modules: Optional[List[LoRAModulePath]], + prompt_adapters: Optional[List[PromptAdapterPath]], + request_logger: Optional[RequestLogger], + chat_template: Optional[str], + chat_template_content_format: ChatTemplateContentFormatOption, + return_tokens_as_token_ids: bool = False, + enable_auto_tools: bool = False, + tool_parser: Optional[str] = None, + enable_prompt_tokens_details: bool = False, + ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -60,8 +65,8 @@ def __init__(self, return_tokens_as_token_ids=return_tokens_as_token_ids) self.response_role = response_role - self.use_tool_use_model_template = False - self.chat_template = load_chat_template(chat_template) + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format # set up tool use self.enable_auto_tools: bool = enable_auto_tools @@ -119,6 +124,7 @@ async def create_chat_completion( ) = self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) + tool_parser = self.tool_parser # validation for OpenAI tools @@ -127,41 +133,11 @@ async def create_chat_completion( return self.create_error_response( "tool_choice = \"required\" is not supported!") - # NOTE: There is currently a bug in pydantic where attributes - # declared as iterables are replaced in in the instances by - # pydantic-core ValidatorIterator instance. In particular, this - # affects tool_calls defined in ChatCompletionAssistantMessageParam - # model: - # see: - # - https://github.com/pydantic/pydantic/issues/9467 - # As a result, tool_calls from assistant messages are never - # deserialized in the request object if the tool_calls iterator is - # not consumed. This affect messages passed to the MistralTokenizer - # since no chat template is applied and therefore the tools_calls - # iterator is not directly consumed. - # Issue is tracked on Pydantic side, with resolution planned for - # v2.11 release. 
In the meantime, the official workaround is to - # consume the iterator so the tool_calls are correctly deserialized - # in the OpenAI ChatCompletionAssistantMessageParam object - # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501 - # Official Pydantic Issues: - # - https://github.com/pydantic/pydantic/issues/9541 - # TODO: remove when pydantic v2.11 is released + # because of issues with pydantic we need to potentially + # re-serialize the tool_calls field of the request + # for more info: see comment in `maybe_serialize_tool_calls` if isinstance(tokenizer, MistralTokenizer): - for i, message in enumerate(request.messages): - if message.get("role") == 'assistant': - tool_calls_validator = message.get( - "tool_calls", ().__iter__()) - validated_tool_calls = [] - while True: - try: - tool_call = next( - tool_calls_validator) # type: ignore - validated_tool_calls.append(tool_call) - except StopIteration: - break - request.messages[i][ - "tool_calls"] = validated_tool_calls + maybe_serialize_tool_calls(request) if (request.tool_choice == "auto" and not (self.enable_auto_tools and tool_parser is not None) @@ -186,6 +162,7 @@ async def create_chat_completion( tokenizer, request.messages, chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self.chat_template_content_format, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, tool_dicts=tool_dicts, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index bbe7db8f13231..74ad7389784fc 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -1,7 +1,7 @@ import asyncio import base64 import time -from typing import AsyncGenerator, List, Literal, Optional, Union, cast +from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast import numpy as np from fastapi import Request @@ -9,7 +9,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, EmbeddingRequest, @@ -77,7 +77,8 @@ def __init__( *, request_logger: Optional[RequestLogger], chat_template: Optional[str], - ): + chat_template_content_format: ChatTemplateContentFormatOption, + ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -85,7 +86,8 @@ def __init__( prompt_adapters=None, request_logger=request_logger) - self.chat_template = load_chat_template(chat_template) + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format async def create_embedding( self, @@ -144,6 +146,8 @@ async def create_embedding( tokenizer, request.messages, chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self. 
+ chat_template_content_format, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, truncate_prompt_tokens=truncate_prompt_tokens, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index fa315fa516632..cae2877ea7e99 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -11,14 +11,16 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ChatTemplateContentFormatOption, ConversationMessage, apply_hf_chat_template, apply_mistral_chat_template, - parse_chat_messages_futures) + parse_chat_messages_futures, + resolve_chat_template_content_format) from vllm.entrypoints.logger import RequestLogger -# yapf conflicts with isort for this block -# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, DetokenizeRequest, @@ -426,7 +428,8 @@ async def _preprocess_chat( request: ChatLikeRequest, tokenizer: AnyTokenizer, messages: List[ChatCompletionMessageParam], - chat_template: Optional[str] = None, + chat_template: Optional[str], + chat_template_content_format: ChatTemplateContentFormatOption, add_generation_prompt: bool = True, continue_final_message: bool = False, tool_dicts: Optional[List[Dict[str, Any]]] = None, @@ -437,10 +440,16 @@ async def _preprocess_chat( add_special_tokens: bool = False, ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt], List[TokensPrompt]]: + resolved_content_format = resolve_chat_template_content_format( + chat_template, + chat_template_content_format, + tokenizer, + ) conversation, mm_data_future = parse_chat_messages_futures( messages, self.model_config, tokenizer, + content_format=resolved_content_format, ) _chat_template_kwargs: Dict[str, Any] = dict( diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 1fd82304f7a4d..59b3b1311f881 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -1,8 +1,8 @@ -from typing import List, Optional, Union +from typing import Final, List, Optional, Union from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -33,7 +33,8 @@ def __init__( lora_modules: Optional[List[LoRAModulePath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], - ): + chat_template_content_format: ChatTemplateContentFormatOption, + ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -41,12 +42,8 @@ def __init__( prompt_adapters=None, request_logger=request_logger) - # If this is None we use the tokenizer's default chat template - # the list of commonly-used chat template names for HF named templates - hf_chat_templates: List[str] = ['default', 'tool_use'] - self.chat_template = chat_template \ - if chat_template in hf_chat_templates \ - else load_chat_template(chat_template) + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format async def create_tokenize( 
self, @@ -75,9 +72,12 @@ async def create_tokenize( request, tokenizer, request.messages, - chat_template=self.chat_template, + chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self. + chat_template_content_format, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, + chat_template_kwargs=request.chat_template_kwargs, add_special_tokens=request.add_special_tokens, ) else: diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 5ce31bd4d941b..aa7c201098935 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,5 +1,3 @@ -import importlib -import importlib.util import os from functools import cached_property from typing import Callable, Dict, List, Optional, Sequence, Type, Union @@ -9,7 +7,7 @@ ExtractedToolCallInformation) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import is_list_of +from vllm.utils import import_from_path, is_list_of logger = init_logger(__name__) @@ -149,13 +147,14 @@ def _register(module): @classmethod def import_tool_parser(cls, plugin_path: str) -> None: """ - Import a user defined tool parser by the path of the tool parser define + Import a user-defined tool parser by the path of the tool parser define file. """ module_name = os.path.splitext(os.path.basename(plugin_path))[0] - spec = importlib.util.spec_from_file_location(module_name, plugin_path) - if spec is None or spec.loader is None: - logger.error("load %s from %s failed.", module_name, plugin_path) + + try: + import_from_path(module_name, plugin_path) + except Exception: + logger.exception("Failed to load module '%s' from %s.", + module_name, plugin_path) return - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index f5c0d92f3f9bd..5caac84138e3b 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -62,7 +62,7 @@ def __init__(self, tokenizer: AnyTokenizer): ] # map what has been streamed for each tool so far to a list self.bot_token = "[TOOL_CALLS]" self.bot_token_id = self.vocab.get(self.bot_token) - self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) + self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL) if self.bot_token_id is None: raise RuntimeError( "Mistral Tool Parser could not locate the tool call token in " @@ -84,16 +84,25 @@ def extract_tool_calls( return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) + + # first remove the BOT token + tool_content = model_output.replace(self.bot_token, "").strip() + try: - # use a regex to find the tool call. remove the BOT token - # and make sure to replace single quotes with double quotes - raw_tool_call = self.tool_call_regex.findall( - model_output.replace(self.bot_token, ""))[0] + # we first try to directly load the json as parsing very nested + # jsons is difficult + try: + function_call_arr = json.loads(tool_content) + except json.JSONDecodeError: + # use a regex to find the part corresponding to the tool call. + # NOTE: This use case should not happen if the model is trained + # correctly. 
It's a easy possible fix so it's included, but + # can be brittle for very complex / highly nested tool calls + raw_tool_call = self.tool_call_regex.findall(tool_content)[0] + function_call_arr = json.loads(raw_tool_call) - # load the JSON, and then use it to build the Function and # Tool Call - function_call_arr = json.loads(raw_tool_call) tool_calls: List[MistralToolCall] = [ MistralToolCall( type="function", @@ -116,7 +125,7 @@ def extract_tool_calls( # return information to just treat the tool call as regular JSON return ExtractedToolCallInformation(tools_called=False, tool_calls=[], - content=model_output) + content=tool_content) def extract_tool_calls_streaming( self, diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index fdf28615fda10..aacff87df6d79 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -67,7 +67,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: model config is unavailable. ''' - if not self.is_encoder_decoder_model(): + if not self.model_config.is_encoder_decoder: print_warning_once("Using None for decoder start token id because " "this is not an encoder/decoder model.") return None @@ -632,7 +632,7 @@ def preprocess( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" - if self.is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return self._process_encoder_decoder_prompt( @@ -660,7 +660,7 @@ async def preprocess_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> ProcessorInputs: """Async version of :meth:`preprocess`.""" - if self.is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return await self._process_encoder_decoder_prompt_async( @@ -679,6 +679,3 @@ async def preprocess_async( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) - - def is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 04fc635828d4d..3443c3feb4d2a 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -165,15 +165,14 @@ class MergedColumnParallelLinearWithShardedLoRA( def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - if lora_a[0] is None or lora_a[1] is None: - return lora_a + #NOTE: lora_a contains 2 subloras, and each sublora could be None. output_shard_size = self.lora_a_stacked[0].shape[2] output_start_idx = self.tp_rank * output_shard_size lora_a = [ - lora_a[0][:, - output_start_idx:output_start_idx + output_shard_size], - lora_a[1][:, - output_start_idx:output_start_idx + output_shard_size], + lora_a[0][:, output_start_idx:output_start_idx + + output_shard_size] if lora_a[0] is not None else None, + lora_a[1][:, output_start_idx:output_start_idx + + output_shard_size] if lora_a[1] is not None else None, ] return lora_a @@ -261,14 +260,16 @@ class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - if lora_a[0] is None or lora_a[1] is None or lora_a[2] is None: - return lora_a + # NOTE: lora_a contains 3 subloras, and each sublora could be None. 
shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)] start_idx = [self.tp_rank * shard_size[i] for i in range(3)] lora_a = [ - lora_a[0][:, start_idx[0]:start_idx[0] + shard_size[0]], - lora_a[1][:, start_idx[1]:start_idx[1] + shard_size[1]], - lora_a[2][:, start_idx[2]:start_idx[2] + shard_size[2]], + lora_a[0][:, start_idx[0]:start_idx[0] + + shard_size[0]] if lora_a[0] is not None else None, + lora_a[1][:, start_idx[1]:start_idx[1] + + shard_size[1]] if lora_a[1] is not None else None, + lora_a[2][:, start_idx[2]:start_idx[2] + + shard_size[2]] if lora_a[2] is not None else None, ] return lora_a diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 7429c60e0222d..6afe80219fe07 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -685,26 +685,27 @@ def slice_lora_a( def slice_lora_b( self, lora_b: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - if lora_b[0] is None or lora_b[1] is None: - return lora_b + #NOTE: lora_b contains 2 subloras, and each sublora could be None. shard_size = self.output_dim start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size lora_b = [ - lora_b[0][:, start_idx:end_idx], - lora_b[1][:, start_idx:end_idx], + lora_b[0][:, start_idx:end_idx] if lora_b[0] is not None else None, + lora_b[1][:, start_idx:end_idx] if lora_b[1] is not None else None, ] return lora_b def slice_bias( self, bias: List[Union[torch.Tensor, None]]) -> List[Union[torch.Tensor, None]]: - if bias[0] is None or bias[1] is None: - return bias + # NOTE : each bias could be None. shard_size = self.output_dim start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size - bias = [bias[0][start_idx:end_idx], bias[1][start_idx:end_idx]] + bias = [ + bias[0][start_idx:end_idx] if bias[0] is not None else None, + bias[1][start_idx:end_idx] if bias[1] is not None else None + ] return bias def set_lora( diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 340da32263c1c..e6f9f01ef0f74 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -105,16 +105,18 @@ def fused_moe_kernel( num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: return - offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int64) offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) token_mask = offs_token < num_valid_tokens - offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N offs_k = tl.arange(0, BLOCK_SIZE_K) a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak) - off_experts = tl.load(expert_ids_ptr + pid_m) + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64) b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) if use_int8_w8a16: diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 024badbc17b96..6fee57a0a03eb 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -63,14 +63,14 @@ def from_config_with_defaults( return cls( pooling_type=PoolingType[pooler_config.pooling_type] if pooler_config.pooling_type is not None else pooling_type, - normalize=pooler_config.pooling_norm - if 
pooler_config.pooling_norm is not None else normalize, - softmax=pooler_config.pooling_softmax - if pooler_config.pooling_softmax is not None else softmax, - step_tag_id=pooler_config.pooling_step_tag_id - if pooler_config.pooling_step_tag_id is not None else step_tag_id, - returned_token_ids=pooler_config.pooling_returned_token_ids - if pooler_config.pooling_returned_token_ids is not None else + normalize=pooler_config.normalize + if pooler_config.normalize is not None else normalize, + softmax=pooler_config.softmax + if pooler_config.softmax is not None else softmax, + step_tag_id=pooler_config.step_tag_id + if pooler_config.step_tag_id is not None else step_tag_id, + returned_token_ids=pooler_config.returned_token_ids + if pooler_config.returned_token_ids is not None else returned_token_ids, ) @@ -94,10 +94,14 @@ def forward( pooled_data = hidden_states[last_token_flat_indices] elif self.pooling_type == PoolingType.ALL: offset = 0 - pooled_data = [] + pooled_data_lst = [] for prompt_len in prompt_lens: - pooled_data.append(hidden_states[offset:offset + prompt_len]) + pooled_data_i = hidden_states[offset:offset + prompt_len] + + pooled_data_lst.append(pooled_data_i) offset += prompt_len + + pooled_data = torch.stack(pooled_data_lst) elif self.pooling_type == PoolingType.MEAN: # Calculate mean pooling cumsum = torch.cumsum(hidden_states, dim=0) @@ -110,24 +114,26 @@ def forward( cumsum[end_indices - 1] - cumsum[start_indices] + hidden_states[start_indices]) / prompt_lens.unsqueeze(1) elif self.pooling_type == PoolingType.STEP: - if self.returned_token_ids is not None and len( - self.returned_token_ids) > 0: - logits = hidden_states[:, - self.returned_token_ids].softmax(dim=-1) - else: - logits = hidden_states.softmax(dim=-1) + returned_token_ids = self.returned_token_ids + if returned_token_ids is not None and len(returned_token_ids) > 0: + hidden_states = hidden_states[:, returned_token_ids] + + logits = hidden_states.softmax(dim=-1) + step_tag_id = self.step_tag_id + offset = 0 - pooled_data = [] + pooled_data_lst = [] for prompt_len, seq_data_i in zip( prompt_lens, pooling_metadata.seq_data.values()): - if self.step_tag_id is None: - pooled_data.append(logits[offset:offset + prompt_len]) - else: - step_idxs = torch.tensor( - seq_data_i.prompt_token_ids) == self.step_tag_id - pooled_data.append(logits[offset:offset + - prompt_len][step_idxs]) + pooled_data_i = logits[offset:offset + prompt_len] + if step_tag_id is not None: + token_ids = torch.tensor(seq_data_i.prompt_token_ids) + pooled_data_i = pooled_data_i[token_ids == step_tag_id] + offset += prompt_len + pooled_data_lst.append(pooled_data_i) + + pooled_data = torch.stack(pooled_data_lst) else: raise ValueError(f"Invalid pooling type: {self.pooling_type}") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 15d9cdbcbb86b..6cbc58d61e970 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -82,9 +82,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md if not self.input_symmetric: - 
layer.azp_adj = layer.weight.sum(dim=0, - keepdim=True, - dtype=torch.int32) + azp_adj = layer.weight.sum(dim=0, keepdim=True, dtype=torch.int32) + if self.is_static_input_scheme: + # cutlass_w8a8 requires azp to be folded into azp_adj + # in the per-tensor case + azp_adj = layer.input_zero_point * azp_adj + + layer.azp_adj = azp_adj else: layer.azp_adj = None @@ -138,7 +142,6 @@ def create_weights(self, layer: torch.nn.Module, def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: - return apply_int8_linear(input=x, weight=layer.weight, weight_scale=layer.weight_scale, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index ec73533126ab6..4037bcb963b25 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -211,13 +211,16 @@ def apply_int8_linear( symmetric=symmetric) if x_zp is not None: + # Currently, static is always per-tensor and dynamic is per-token + static = input_zero_point is not None + azp = None if static else x_zp return ops.cutlass_scaled_mm_azp(x_q, weight, scale_a=x_scale, scale_b=weight_scale, out_dtype=input.dtype, azp_adj=azp_adj, - azp=x_zp, + azp=azp, bias=bias) return ops.cutlass_scaled_mm(x_q, weight, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 906c151d28eb5..a95599b9606da 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -96,18 +96,34 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: model_config = vllm_config.model_config model_class, _ = get_model_architecture(model_config) signatures = inspect.signature(model_class.__init__) - # collect all kw-only parameters - kw_only_params = [ - param.name for param in signatures.parameters.values() - if param.kind == inspect.Parameter.KEYWORD_ONLY - ] - assert "vllm_config" in kw_only_params and "prefix" in kw_only_params, \ - ("vLLM model class must accept `vllm_config` and `prefix` as kw-only " - "arguments. Possibly you have an old-style model class registered from " - "out of tree and it is used for new vLLM version. " - "Please check https://docs.vllm.ai/en/latest/design/class_hierarchy.html " - "for the design and update the model class accordingly.") - return model_class(vllm_config=vllm_config, prefix=prefix) + all_params = [param.name for param in signatures.parameters.values()] + if "vllm_config" in all_params and "prefix" in all_params: + # new-style model class + return model_class(vllm_config=vllm_config, prefix=prefix) + msg = ("vLLM model class should accept `vllm_config` and `prefix` as " + "input arguments. Possibly you have an old-style model class" + " registered from out of tree and it is used for new vLLM version. 
" + "Check https://docs.vllm.ai/en/latest/design/class_hierarchy.html " + "for the design and update the model class accordingly.") + logger.warning(msg) + logger.warning( + "Trying to guess the arguments for old-style model class %s", + model_class) + # try to be compatible with old-style model class + kwargs = {} + if "prefix" in all_params: + kwargs["prefix"] = prefix + if "config" in all_params: + kwargs["config"] = model_config.hf_config + if "cache_config" in all_params: + kwargs["cache_config"] = vllm_config.cache_config + if "quant_config" in all_params: + kwargs["quant_config"] = vllm_config.quant_config + if "lora_config" in all_params: + kwargs["lora_config"] = vllm_config.lora_config + if "scheduler_config" in all_params: + kwargs["scheduler_config"] = vllm_config.scheduler_config + return model_class(**kwargs) class BaseModelLoader(ABC): diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 7dbc7fa0aaba4..42dd6119e76f1 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -5,7 +5,7 @@ from transformers import BertConfig from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, PoolerConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -305,14 +305,16 @@ def forward(self, hidden_states: torch.Tensor, class BertModel(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + embedding_class: type = BertEmbedding): super().__init__() - config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - - self.embeddings = BertEmbedding(config) + self.embeddings = embedding_class(config) self.encoder = BertEncoder(config, cache_config, quant_config, @@ -382,13 +384,9 @@ class BertEmbeddingModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() pooler_config = vllm_config.model_config.pooler_config - self.model = BertModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.CLS, - normalize=True, - softmax=False) + self.model = self._build_model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self._pooler = self._build_pooler(pooler_config) def forward( self, @@ -415,3 +413,16 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): self.model.load_weights(weights) + + def _build_model(self, + vllm_config: VllmConfig, + prefix: str = "") -> BertModel: + return BertModel(vllm_config=vllm_config, + prefix=prefix, + embedding_class=BertEmbedding) + + def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: + return Pooler.from_config_with_defaults(pooler_config, + pooling_type=PoolingType.CLS, + normalize=True, + softmax=False) diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index dcfcb6694feb5..b3dbf063ac298 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -250,6 +250,9 @@ def __init__( self.mlp = FalconMLP(config, quant_config) self.config = config + if (not hasattr(config, "num_ln_in_parallel_attn")): + 
config.num_ln_in_parallel_attn = None + if (config.num_ln_in_parallel_attn is None and config.new_decoder_architecture): config.num_ln_in_parallel_attn = 2 diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 50701793b7b83..31fc098a8bb3f 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -41,7 +41,8 @@ from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP -from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings +from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, + merge_multimodal_embeddings) # Cannot find the following 2 numbers from hf config. _IMAGE_TOKEN_ID = 71011 @@ -245,7 +246,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): gather_output=True, ) self.language_model = PersimmonForCausalLM( - vllm_config.with_hf_config(config.text_config)) + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=maybe_prefix(prefix, "language_model"), + ) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 34889d691a934..f1b7c896cadfe 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -161,11 +161,5 @@ class InternLM2VEForCausalLM(InternLM2ForCausalLM): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.model = InternLM2VEModel(config, - cache_config, - quant_config, + self.model = InternLM2VEModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 8aed0fead18f9..e53631ef19f31 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -90,8 +90,8 @@ def __init__( self.act_fn = SiluAndMul() def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) + x, _ = self.gate_up_proj(x) + x = self.act_fn(x) x, _ = self.down_proj(x) return x diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 999739ccd98bf..fd8eda997f76f 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -382,11 +382,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): instantiated. 
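The loader change above keeps new-style classes (keyword-only vllm_config and prefix) on the fast path and falls back to signature inspection for old-style out-of-tree models, which is also why the constructors in the following files are normalized to the keyword-only form. A condensed sketch of that fallback, using a made-up stand-in class rather than a real vLLM model:

```python
import inspect


class OldStyleModel:
    """Stand-in for an out-of-tree model that predates the vllm_config API."""

    def __init__(self, config, quant_config=None, prefix=""):
        self.config = config
        self.quant_config = quant_config
        self.prefix = prefix


def build_model(model_class, available_kwargs):
    """Pass only the constructor arguments the class actually declares."""
    params = inspect.signature(model_class.__init__).parameters
    kwargs = {k: v for k, v in available_kwargs.items() if k in params}
    return model_class(**kwargs)


model = build_model(
    OldStyleModel,
    {
        "config": {"hidden_size": 1024},  # hf_config in the real loader
        "quant_config": None,
        "lora_config": None,              # ignored: not in the signature
        "prefix": "model",
    },
)
print(type(model).__name__, model.prefix)  # OldStyleModel model
```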
""" - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config quant_config = vllm_config.quant_config @@ -699,12 +695,8 @@ def is_default_weight_loading(self, name: str) -> bool: class MiniCPMV2_0(MiniCPMVBaseModel): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 0) def init_llm( @@ -857,12 +849,8 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 5) def init_llm( @@ -999,12 +987,8 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 6) def init_llm( @@ -1117,7 +1101,7 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __new__(cls, vllm_config: VllmConfig, prefix: str = ""): + def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config if not hasattr(config, "version"): if config.hidden_size == 2304 and config.query_num == 64: diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 6aa43f22f4c93..4d7e82880041d 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -65,7 +65,7 @@ class MLPSpeculator(nn.Module): https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite """ - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config self.n_predict = config.n_predict diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index b623c576bb673..431e397e1e10d 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -37,6 +37,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -44,8 +45,9 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces 
import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, @@ -247,6 +249,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + # TODO (@robertgshaw2): see if this can be moved out + if (cache_config.sliding_window is not None + and hasattr(config, "max_window_layers")): + raise ValueError("Sliding window for some but all layers is not " + "supported. This model uses sliding window " + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( + config.max_window_layers, + config.num_hidden_layers, + )) + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -405,20 +419,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) + pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -438,6 +441,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() + + # The same model class supports both language generation and embedding + # because the architecture name is the same + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False) + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -475,6 +487,13 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, @@ -482,3 +501,70 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if self.config.tie_word_embeddings else None), ) loader.load_weights(weights) + + +class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config + + self.config = config + self.lora_config = lora_config + + 
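The embedding variant defined below defaults to MEAN pooling, which relies on the cumulative-sum formula carried over in the pooler refactor earlier in this diff. A quick numerical check of that closed form against a direct per-prompt mean; the shapes are arbitrary:

```python
import torch

hidden_states = torch.randn(7, 4)      # 7 tokens across 2 prompts, hidden size 4
prompt_lens = torch.tensor([3, 4])

# Closed form used by PoolingType.MEAN: one cumulative sum over the token axis.
cumsum = torch.cumsum(hidden_states, dim=0)
end_indices = torch.cumsum(prompt_lens, dim=0)
start_indices = torch.cat([torch.tensor([0]), end_indices[:-1]])
pooled = (cumsum[end_indices - 1] - cumsum[start_indices] +
          hidden_states[start_indices]) / prompt_lens.unsqueeze(1)

# Reference: mean each prompt's slice directly.
expected = torch.stack([hidden_states[0:3].mean(dim=0),
                        hidden_states[3:7].mean(dim=0)])
print(torch.allclose(pooled, expected, atol=1e-6))  # True
```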
self.quant_config = quant_config + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.MEAN, + normalize=True, + softmax=False) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> torch.Tensor: + return self.model(input_ids, positions, kv_caches, attn_metadata, + intermediate_tensors) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self, + ignore_unexpected_prefixes=["lm_head."]) + loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 27eb7e8a93975..120403e948686 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -17,10 +17,11 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput +from .interfaces import SupportsLoRA, SupportsPP from .utils import AutoWeightsLoader, maybe_prefix -class Qwen2ForSequenceClassification(nn.Module): +class Qwen2ForSequenceClassification(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -46,21 +47,9 @@ class Qwen2ForSequenceClassification(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config pooler_config = vllm_config.model_config.pooler_config - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. 
Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 89768ec9dff37..55843d8325348 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -16,7 +16,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader, maybe_prefix @@ -32,7 +32,7 @@ def forward(self, input): return self.activation(input) -class Qwen2ForRewardModel(nn.Module, SupportsPP): +class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -58,21 +58,9 @@ class Qwen2ForRewardModel(nn.Module, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config pooler_config = vllm_config.model_config.pooler_config - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f172c06c4a26a..22c2e328bfb65 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -1,3 +1,7 @@ +""" +Whenever you add an architecture to this page, please also update +`tests/models/registry.py` with example HuggingFace models for it. 
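The registry entries added below map each architecture name to a (module, class) pair that is imported lazily. A reduced sketch of that lookup pattern; the package path and the handful of entries shown are illustrative, not the full table:

```python
import importlib

_EXAMPLE_MODELS = {
    # architecture name -> (module under the models package, class name)
    "BertModel": ("bert", "BertEmbeddingModel"),
    "RobertaModel": ("roberta", "RobertaEmbeddingModel"),
    "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"),
}


def resolve_model_class(architecture: str,
                        package: str = "vllm.model_executor.models"):
    """Import the owning module only when the architecture is requested."""
    module_name, class_name = _EXAMPLE_MODELS[architecture]
    module = importlib.import_module(f"{package}.{module_name}")
    return getattr(module, class_name)
```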
+""" import importlib import os import pickle @@ -7,7 +11,8 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from functools import lru_cache -from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union +from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type, + TypeVar, Union) import cloudpickle import torch.nn as nn @@ -58,14 +63,14 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), + "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), + "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), - "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), - "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"), @@ -94,6 +99,8 @@ _EMBEDDING_MODELS = { # [Text-only] "BertModel": ("bert", "BertEmbeddingModel"), + "RobertaModel": ("roberta", "RobertaEmbeddingModel"), + "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), "LlamaModel": ("llama", "LlamaEmbeddingModel"), @@ -104,6 +111,8 @@ }, "MistralModel": ("llama", "LlamaEmbeddingModel"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), + "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), + "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 # [Multimodal] @@ -295,8 +304,8 @@ class _ModelRegistry: # Keyed by model_arch models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) - def get_supported_archs(self) -> List[str]: - return list(self.models.keys()) + def get_supported_archs(self) -> AbstractSet[str]: + return self.models.keys() def register_model( self, diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py new file mode 100644 index 0000000000000..c1dcdd36ec3de --- /dev/null +++ b/vllm/model_executor/models/roberta.py @@ -0,0 +1,117 @@ +from typing import List, Optional + +import torch +from torch import nn +from transformers import RobertaConfig + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel +from vllm.sequence import IntermediateTensors + + +class RobertaEmbedding(nn.Module): + + def __init__(self, config: RobertaConfig): + super().__init__() + self.size = config.hidden_size + self.word_embeddings = VocabParallelEmbedding(config.vocab_size, + config.hidden_size) + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size, + padding_idx=self.padding_idx) + + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + 
eps=config.layer_norm_eps) + self.position_ids = nn.Parameter( + torch.empty((1, config.max_position_embeddings)), ) + + self.position_embedding_type = config.position_embedding_type + if self.position_embedding_type != "absolute": + raise ValueError("Only 'absolute' position_embedding_type" + + " is supported") + + def forward( + self, + input_ids: torch.Tensor, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + input_shape = input_ids.size() + + # Input embeddings. + inputs_embeds = self.word_embeddings(input_ids) + + # TODO: figure out if there is a better way + # to make to make position ids start at padding_idx + 1 + # References: + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 + position_ids += self.padding_idx + 1 + + # Position embeddings. + position_embeddings = self.position_embeddings(position_ids) + + # Token type embeddings. (TODO: move off hotpath?) + token_type_embeddings = self.token_type_embeddings( + torch.zeros(input_shape, + dtype=torch.long, + device=inputs_embeds.device)) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) + return embeddings + + +class RobertaEmbeddingModel(BertEmbeddingModel): + """A model that uses Roberta to provide embedding functionalities. + + This class encapsulates the BertModel and provides an interface for + embedding operations and customized pooling functions. + + Attributes: + model: An instance of BertModel used for forward operations. + _pooler: An instance of Pooler used for pooling operations. + """ + + def _build_model(self, + vllm_config: VllmConfig, + prefix: str = "") -> BertModel: + return BertModel(vllm_config=vllm_config, + prefix=prefix, + embedding_class=RobertaEmbedding) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + # Verify assumption that position are always a sequence from + # 0 to N. (Actually here we just check 0 and N to simplify). + # This is important to fix the position which are assumed to + # start from padding_idx + 1 instead of 0 in the Roberta models. 
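The offset applied in RobertaEmbedding.forward above follows the Hugging Face RoBERTa convention, where absolute positions for real tokens start at padding_idx + 1 rather than 0. A tiny illustration of the shift for one unpadded sequence; padding_idx = 1 is RoBERTa's usual pad token id:

```python
import torch

padding_idx = 1                      # RoBERTa's pad_token_id
seq_len = 6

# vLLM hands the embedding layer positions 0..N-1 for each sequence.
vllm_positions = torch.arange(seq_len)

# The shift performed in RobertaEmbedding.forward above.
roberta_positions = vllm_positions + padding_idx + 1
print(roberta_positions)             # tensor([2, 3, 4, 5, 6, 7])
```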
+ assert hasattr(attn_metadata, "seq_lens_tensor") + cumulative = attn_metadata.seq_lens_tensor.cumsum(dim=0) + start_pos = torch.cat( + (torch.tensor([0], device=attn_metadata.seq_lens_tensor.device), + cumulative[:-1])) + assert len(torch.nonzero(positions[start_pos])) == 0 + end_pos = cumulative - 1 + last_tokens = attn_metadata.seq_lens_tensor - 1 + assert len(torch.nonzero(positions[end_pos] - last_tokens)) == 0 + + return super().forward(input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 8373e11cfff9f..7b1bbb14c5302 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -27,16 +27,27 @@ def load_general_plugins(): allowed_plugins = envs.VLLM_PLUGINS discovered_plugins = entry_points(group='vllm.general_plugins') + if len(discovered_plugins) == 0: + logger.info("No plugins found.") + return + logger.info("Available plugins:") + for plugin in discovered_plugins: + logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value, + plugin.group) + if allowed_plugins is None: + logger.info("all available plugins will be loaded.") + logger.info("set environment variable VLLM_PLUGINS to control" + " which plugins to load.") + else: + logger.info("plugins to load: %s", allowed_plugins) for plugin in discovered_plugins: - logger.info("Found general plugin: %s", plugin.name) if allowed_plugins is None or plugin.name in allowed_plugins: try: func = plugin.load() func() - logger.info("Loaded general plugin: %s", plugin.name) + logger.info("plugin %s loaded.", plugin.name) except Exception: - logger.exception("Failed to load general plugin: %s", - plugin.name) + logger.exception("Failed to load plugin %s", plugin.name) _torch_compile_backend: Optional[Union[Callable, str]] = None diff --git a/vllm/scripts.py b/vllm/scripts.py index 4e4c071784287..a51c21cfa29e7 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -9,6 +9,7 @@ from openai import OpenAI from openai.types.chat import ChatCompletionMessageParam +import vllm.version from vllm.engine.arg_utils import EngineArgs from vllm.entrypoints.openai.api_server import run_server from vllm.entrypoints.openai.cli_args import (make_arg_parser, @@ -143,6 +144,11 @@ def main(): env_setup() parser = FlexibleArgumentParser(description="vLLM CLI") + parser.add_argument('-v', + '--version', + action='version', + version=vllm.version.__version__) + subparsers = parser.add_subparsers(required=True, dest="subparser") serve_parser = subparsers.add_parser( diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 6a7929d9d8f9c..25ef27b8378f0 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -353,6 +353,7 @@ def _create_single_target_seq_group_metadata( seq_data = seq_group_metadata.seq_data[seq_id] prompt_token_ids = seq_data.prompt_token_ids_array new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] + mrope_position_delta = seq_data.mrope_position_delta new_seq_data_dict = { target_seq_id: @@ -368,6 +369,7 @@ def _create_single_target_seq_group_metadata( # the kv cache is filled by a previous batch in the batch expansion. 
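Returning to the plugin-loading change above: the flow is to discover entry points in the vllm.general_plugins group, optionally restrict them to the VLLM_PLUGINS allow-list, then call each plugin's entry function. A condensed sketch assuming the Python 3.10+ entry_points API and a comma-separated environment variable (the real code reads it through vllm.envs):

```python
import os
from importlib.metadata import entry_points


def load_plugins(group: str = "vllm.general_plugins") -> None:
    allowed = os.environ.get("VLLM_PLUGINS")
    allowed_names = None if allowed is None else allowed.split(",")

    discovered = entry_points(group=group)
    if len(discovered) == 0:
        print("No plugins found.")
        return

    for plugin in discovered:
        if allowed_names is not None and plugin.name not in allowed_names:
            continue
        try:
            func = plugin.load()   # resolve the entry point to a callable
            func()                 # run the plugin's setup hook
            print(f"plugin {plugin.name} loaded.")
        except Exception as exc:   # a broken plugin should not stop the rest
            print(f"Failed to load plugin {plugin.name}: {exc}")
```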
for data in new_seq_data_dict.values(): data.update_num_computed_tokens(data.get_len() - 1) + data.mrope_position_delta = mrope_position_delta return SequenceGroupMetadata( request_id=seq_group_metadata.request_id, diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py index 5f437d414e181..e68ad79b296b8 100644 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ b/vllm/transformers_utils/tokenizers/__init__.py @@ -1,3 +1,3 @@ -from .mistral import MistralTokenizer +from .mistral import MistralTokenizer, maybe_serialize_tool_calls -__all__ = ["MistralTokenizer"] +__all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"] diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 1b273c6b120ea..83b3c37d6f04c 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -7,6 +7,7 @@ import huggingface_hub from huggingface_hub import HfApi, hf_hub_download from mistral_common.protocol.instruct.request import ChatCompletionRequest +from mistral_common.tokens.tokenizers.base import SpecialTokens # yapf: disable from mistral_common.tokens.tokenizers.mistral import ( MistralTokenizer as PublicMistralTokenizer) @@ -29,6 +30,43 @@ class Encoding: input_ids: List[int] +def maybe_serialize_tool_calls(request: ChatCompletionRequest): + # SEE: https://github.com/vllm-project/vllm/pull/9951 + # Credits go to: @gcalmettes + # NOTE: There is currently a bug in pydantic where attributes + # declared as iterables are replaced in in the instances by + # pydantic-core ValidatorIterator instance. In particular, this + # affects tool_calls defined in ChatCompletionAssistantMessageParam + # model: + # see: + # - https://github.com/pydantic/pydantic/issues/9467 + # As a result, tool_calls from assistant messages are never + # deserialized in the request object if the tool_calls iterator is + # not consumed. This affect messages passed to the MistralTokenizer + # since no chat template is applied and therefore the tools_calls + # iterator is not directly consumed. + # Issue is tracked on Pydantic side, with resolution planned for + # v2.11 release. In the meantime, the official workaround is to + # consume the iterator so the tool_calls are correctly deserialized + # in the OpenAI ChatCompletionAssistantMessageParam object + # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501 + # Official Pydantic Issues: + # - https://github.com/pydantic/pydantic/issues/9541 + # TODO: remove when pydantic v2.11 is released + for i, message in enumerate(request.messages): + if message.get("role") == 'assistant': + tool_calls_validator = message.get("tool_calls", ().__iter__()) + validated_tool_calls = [] + while True: + try: + tool_call = next(tool_calls_validator) # type: ignore + validated_tool_calls.append(tool_call) + except StopIteration: + break + + request.messages[i]["tool_calls"] = validated_tool_calls + + def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: repo_cache = os.path.join( huggingface_hub.constants.HF_HUB_CACHE, @@ -136,18 +174,29 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str, revision=revision) return tokenizer_file - # the following attributes are set to fit VLLM's design + # the following attributes are set to fit VLLM's design and are used + # by the guided structured output backends. 
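The maybe_serialize_tool_calls helper above works around the pydantic ValidatorIterator issue by draining the tool_calls iterator so the assistant message holds a plain list again; the while/next loop there is equivalent to calling list() on the iterator. A self-contained illustration with a made-up message:

```python
def materialize_tool_calls(messages):
    """Consume any lazy tool_calls iterator so it serializes as a plain list."""
    for i, message in enumerate(messages):
        if message.get("role") != "assistant":
            continue
        tool_calls_iter = iter(message.get("tool_calls", ()))
        messages[i]["tool_calls"] = list(tool_calls_iter)


msgs = [{
    "role": "assistant",
    # Stands in for the pydantic-core ValidatorIterator described above.
    "tool_calls": iter([{"id": "call_0", "type": "function"}]),
}]
materialize_tool_calls(msgs)
print(msgs[0]["tool_calls"])   # a concrete list, safe to re-serialize
```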
@property def all_special_tokens_extended(self) -> List[str]: - return [] + # tekken defines its own extended special tokens list + if hasattr(self.tokenizer, "SPECIAL_TOKENS"): + special_tokens = self.tokenizer.SPECIAL_TOKENS + else: + special_tokens = list(SpecialTokens) + return [ + s.value if isinstance(s, SpecialTokens) else s + for s in special_tokens + ] @property def all_special_tokens(self) -> List[str]: - return [] + return self.all_special_tokens_extended @property def all_special_ids(self) -> List[int]: - return [] + return [ + self.all_special_tokens.index(t) for t in self.all_special_tokens + ] @property def bos_token_id(self) -> int: @@ -222,7 +271,8 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str: if self.is_tekken: tokens = [ t for t in tokens - if t not in self.tokenizer._all_special_tokens + if (t is SpecialTokens.tool_calls + or t not in self.tokenizer._all_special_tokens) ] if any(isinstance(t, bytes) for t in tokens): @@ -246,7 +296,27 @@ def _token_to_id(t: str): else: decoded = "".join(tokens) else: - decoded = self.tokenizer.decode(tokens) # type: ignore[arg-type] + # make sure certain special tokens like Tool calls are + # not decoded + special_tokens = {SpecialTokens.tool_calls} + regular_tokens: List[str] = [] + decoded_list = [] + + for token in tokens: + if token in special_tokens: + if regular_tokens: + decoded_list.append( + self.tokenizer.decode(regular_tokens)) + regular_tokens = [] + decoded_list.append(token) + else: + regular_tokens.append(token) + + if regular_tokens: + decoded_list.append( + self.decode(regular_tokens)) # type: ignore + + decoded = ''.join(decoded_list) return decoded @@ -274,8 +344,11 @@ def convert_ids_to_tokens( assert self.is_tekken or self.is_spm, type(self.tokenizer) if self.is_tekken: - # skip special tokens - ids = [i for i in ids if i > self.tokenizer.num_special_tokens] + # skip special tokens except tool call + ids = [ + i for i in ids if i > self.tokenizer.num_special_tokens or i == + self.tokenizer.get_control_token(SpecialTokens.tool_calls) + ] tokens = [self.tokenizer.id_to_piece(id) for id in ids] diff --git a/vllm/utils.py b/vllm/utils.py index 1b02cbff79f78..111460a29de47 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -5,6 +5,7 @@ import enum import gc import getpass +import importlib.util import inspect import ipaddress import os @@ -1539,6 +1540,25 @@ def is_in_doc_build() -> bool: return False +def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): + """ + Import a Python file according to its file path. 
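The non-tekken decode path above splits the token stream into runs of regular tokens, which are decoded together, while the tool-call special token is passed through verbatim. A generic sketch of that pattern; decode_fn and the token strings below are placeholders, not the Mistral tokenizer API:

```python
from typing import Callable, Iterable, List, Set


def decode_keeping_special(tokens: Iterable[str],
                           special_tokens: Set[str],
                           decode_fn: Callable[[List[str]], str]) -> str:
    decoded_parts: List[str] = []
    regular_run: List[str] = []
    for token in tokens:
        if token in special_tokens:
            if regular_run:                       # flush the pending run
                decoded_parts.append(decode_fn(regular_run))
                regular_run = []
            decoded_parts.append(token)           # keep the special token as-is
        else:
            regular_run.append(token)
    if regular_run:
        decoded_parts.append(decode_fn(regular_run))
    return "".join(decoded_parts)


print(decode_keeping_special(
    ["▁Hello", "▁world", "[TOOL_CALLS]", "[{...}]"],
    special_tokens={"[TOOL_CALLS]"},
    decode_fn=lambda toks: "".join(toks).replace("▁", " "),
))
```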
+ + Based on the official recipe: + https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly + """ + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ModuleNotFoundError(f"No module named '{module_name}'") + + assert spec.loader is not None + + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + # create a library to hold the custom op vllm_lib = Library("vllm", "FRAGMENT") # noqa diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4ebfff9584267..75a77be750acd 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -163,9 +163,6 @@ def step(self) -> List[RequestOutput]: def get_model_config(self): pass - def is_encoder_decoder_model(self): - pass - def start_profile(self): pass diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d8c8011a585d8..d3ca6d9d0b17e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -232,7 +232,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: logger.info( "Memory profiling results: total_gpu_memory=%.2fGiB" " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB" - " memory_usage_post_profile=%.2fGib" + " memory_usage_post_profile=%.2fGiB" " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB" " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3), (total_gpu_memory - free_memory_pre_profile) / (1024**3),
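To round out the import_from_path addition, a usage sketch: the helper is re-declared here so the snippet runs on its own, and the plugin file written to the temporary directory is purely illustrative.

```python
import importlib.util
import os
import sys
import tempfile


def import_from_path(module_name, file_path):
    # Same recipe as the helper above, reproduced so this snippet is standalone.
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ModuleNotFoundError(f"No module named '{module_name}'")
    assert spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module


with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "my_tool_parser.py")
    with open(path, "w") as f:
        f.write("GREETING = 'hello from a user-defined plugin'\n")
    plugin = import_from_path("my_tool_parser", path)
    print(plugin.GREETING)
```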