Merge branch 'vllm-project:main' into main

vllm-project · Dec 10, 2024 · 33d737a · 33d737a
2 parents 111ff87 + d1f6d1c
commit 33d737a
Show file tree

Hide file tree

Showing 48 changed files with 2,084 additions and 1,720 deletions.
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
@@ -39,3 +39,19 @@ steps:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+  - label: "Build and publish TPU release image"
+    depends_on: ~
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: tpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+      - "docker push vllm/vllm-tpu:nightly"
+      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllm
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
@@ -1,5 +1,6 @@
 # default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
+# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
 
 FROM $BASE_IMAGE
 

diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
@@ -307,10 +307,20 @@ void reshape_and_cache_flash(
     torch::Tensor& key_cache,  // [num_blocks, block_size, num_heads, head_size]
     torch::Tensor&
         value_cache,  // [num_blocks, block_size, num_heads, head_size]
-    torch::Tensor& slot_mapping,  // [num_tokens]
+    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
     const std::string& kv_cache_dtype, const double k_scale,
     const double v_scale) {
-  int num_tokens = key.size(0);
+  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
+  // slot_mapping.size(0) because of padding for CUDA graphs.
+  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+  // both include padding.
+  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+  // since key includes padding for CUDA graphs, while slot_mapping does not.
+  // In this case, slot_mapping.size(0) represents the actual number of tokens
+  // before padding.
+  // For compatibility with both cases, we use slot_mapping.size(0) as the
+  // number of tokens.
+  int num_tokens = slot_mapping.size(0);
   int num_heads = key.size(1);
   int head_size = key.size(2);
   int block_size = key_cache.size(1);

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -102,6 +102,7 @@ Documentation
 
    usage/lora
    usage/multimodal_inputs
+   usage/tool_calling
    usage/structured_outputs
    usage/spec_decode
    usage/compatibility_matrix

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
@@ -495,7 +495,7 @@ Text Generation
 ---------------
 
 .. list-table::
-  :widths: 25 25 15 25 5 5
+  :widths: 25 25 15 20 5 5 5
   :header-rows: 1
 
   * - Architecture
@@ -504,144 +504,168 @@ Text Generation
     - Example HF Models
     - :ref:`LoRA <lora>`
     - :ref:`PP <distributed_serving>`
+    - V1
   * - :code:`AriaForConditionalGeneration`
     - Aria
     - T + I
     - :code:`rhymes-ai/Aria`
     - 
     - ✅︎
+    - 
   * - :code:`Blip2ForConditionalGeneration`
     - BLIP-2
     - T + I\ :sup:`E`
     - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
     -
     - ✅︎
+    - 
   * - :code:`ChameleonForConditionalGeneration`
     - Chameleon
     - T + I
     - :code:`facebook/chameleon-7b` etc.
     - 
     - ✅︎
+    - 
   * - :code:`FuyuForCausalLM`
     - Fuyu
     - T + I
     - :code:`adept/fuyu-8b` etc.
     - 
     - ✅︎
+    - 
   * - :code:`ChatGLMModel`
     - GLM-4V
     - T + I
     - :code:`THUDM/glm-4v-9b` etc.
     - ✅︎
     - ✅︎
+    - 
   * - :code:`H2OVLChatModel`
     - H2OVL
     - T + I\ :sup:`E+`
     - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
     - 
     - ✅︎
+    - 
   * - :code:`Idefics3ForConditionalGeneration`
     - Idefics3
     - T + I
     - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc.
     - ✅︎
+    -
     - 
   * - :code:`InternVLChatModel`
     - InternVL 2.5, Mono-InternVL, InternVL 2.0
     - T + I\ :sup:`E+`
     - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc.
     - 
     - ✅︎
+    - ✅︎
   * - :code:`LlavaForConditionalGeneration`
     - LLaVA-1.5
     - T + I\ :sup:`E+`
     - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
     -
     - ✅︎
+    - ✅︎
   * - :code:`LlavaNextForConditionalGeneration`
     - LLaVA-NeXT
     - T + I\ :sup:`E+`
     - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
     -
     - ✅︎
+    - 
   * - :code:`LlavaNextVideoForConditionalGeneration`
     - LLaVA-NeXT-Video
     - T + V
     - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
     -
     - ✅︎
+    - 
   * - :code:`LlavaOnevisionForConditionalGeneration`
     - LLaVA-Onevision
     - T + I\ :sup:`+` + V\ :sup:`+`
     - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
     -
     - ✅︎
+    - 
   * - :code:`MiniCPMV`
     - MiniCPM-V
     - T + I\ :sup:`E+`
     - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
     - ✅︎
     - ✅︎
+    - 
   * - :code:`MllamaForConditionalGeneration`
     - Llama 3.2
     - T + I\ :sup:`+`
     - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
     -
     -
+    -
   * - :code:`MolmoForCausalLM`
     - Molmo
     - T + I
     - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc.
     -
     - ✅︎
+    - ✅︎
   * - :code:`NVLM_D_Model`
     - NVLM-D 1.0
     - T + I\ :sup:`E+`
     - :code:`nvidia/NVLM-D-72B`, etc.
     - 
     - ✅︎
+    - ✅︎
   * - :code:`PaliGemmaForConditionalGeneration`
     - PaliGemma
     - T + I\ :sup:`E`
     - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
     - 
     - ✅︎
+    - 
   * - :code:`Phi3VForCausalLM`
     - Phi-3-Vision, Phi-3.5-Vision
     - T + I\ :sup:`E+`
     - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
     -
     - ✅︎
+    - ✅︎
   * - :code:`PixtralForConditionalGeneration`
     - Pixtral
     - T + I\ :sup:`+`
     - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc.
     -
     - ✅︎
+    - ✅︎
   * - :code:`QWenLMHeadModel`
     - Qwen-VL
     - T + I\ :sup:`E+`
     - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
     - ✅︎
     - ✅︎
+    -
   * - :code:`Qwen2AudioForConditionalGeneration`
     - Qwen2-Audio
     - T + A\ :sup:`+`
     - :code:`Qwen/Qwen2-Audio-7B-Instruct`
     -
     - ✅︎
+    - 
   * - :code:`Qwen2VLForConditionalGeneration`
     - Qwen2-VL
     - T + I\ :sup:`E+` + V\ :sup:`E+`
     - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
     - ✅︎
     - ✅︎
+    - 
   * - :code:`UltravoxModel`
     - Ultravox
     - T + A\ :sup:`E+`
     - :code:`fixie-ai/ultravox-v0_3`
     -
     - ✅︎
+    - 
 
 | :sup:`E` Pre-computed embeddings can be inputted for this modality.
 | :sup:`+` Multiple items can be inputted per text prompt for this modality.