From 3fb4b4f1634a896653acc12c72b8e5d6d87a8f82 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 11 Dec 2024 00:39:53 -0800 Subject: [PATCH 01/87] [ci/build] Fix AMD CI dependencies (#11087) --- requirements-rocm.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 121123611d2da..ccc9062341772 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -5,7 +5,8 @@ awscli boto3 botocore +datasets ray >= 2.10.0 peft pytest-asyncio -tensorizer>=2.9.0 \ No newline at end of file +tensorizer>=2.9.0 From 9974fca047bb332ec68377be4579ea515a300d69 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 11 Dec 2024 01:01:53 -0800 Subject: [PATCH 02/87] [ci/build] Fix entrypoints test and pin outlines version (#11088) --- requirements-common.txt | 2 +- .../guided_decoding/outlines_logits_processors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index c71fc458aca13..792cd58e80669 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -18,7 +18,7 @@ prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines >= 0.1.8 +outlines == 0.1.9 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 1f0dbe024609d..b63fed1c8a8c3 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -25,7 +25,7 @@ from outlines import grammars from outlines.caching import cache from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write -from outlines.fsm.json_schema import build_regex_from_schema +from outlines_core.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel from transformers import PreTrainedTokenizerBase From 61b1d2f6aef8e29c6a0d795a9c6682d525f4d8cc Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 11 Dec 2024 04:26:36 -0500 Subject: [PATCH 03/87] [Core] v1: Use atexit to handle engine core client shutdown (#11076) Signed-off-by: Russell Bryant --- vllm/v1/engine/core_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ee89cece73141..4d96b323d1662 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,4 @@ +import atexit import multiprocessing from typing import List, Union @@ -157,6 +158,7 @@ def __init__( should_shutdown=self.should_shutdown, **kwargs, ) + atexit.register(self.shutdown) def shutdown(self): # Send shutdown signal to background process. 
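The shutdown handling in #11076 above relies on Python's ``atexit`` hook: registering the bound ``shutdown`` method guarantees the background engine-core process is torn down even when the caller never closes the client explicitly. The sketch below is not the vLLM implementation, only a minimal illustration of the same pattern; the class name and the worker loop are invented for this example.

.. code-block:: python

    import atexit
    import multiprocessing
    import time


    def _worker_loop():
        # Stand-in for the real engine-core busy loop.
        while True:
            time.sleep(1)


    class BackgroundClient:
        """Minimal illustration of the atexit-based cleanup used in #11076."""

        def __init__(self):
            self._closed = False
            self.proc = multiprocessing.Process(target=_worker_loop, daemon=True)
            self.proc.start()
            # Guarantee cleanup at interpreter exit, even if the caller never
            # calls shutdown() explicitly.
            atexit.register(self.shutdown)

        def shutdown(self):
            if self._closed:  # safe to call more than once
                return
            self._closed = True
            self.proc.terminate()
            self.proc.join()


    if __name__ == "__main__":
        client = BackgroundClient()
        # No explicit client.shutdown(): the atexit hook tears the worker down.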
From 2e32f5d28db3cd79f6a421f640e083be1f9468b7 Mon Sep 17 00:00:00 2001 From: B-201 Date: Wed, 11 Dec 2024 17:27:07 +0800 Subject: [PATCH 04/87] [Bugfix] Fix Idefics3 fails during multi-image inference (#11080) Signed-off-by: B-201 --- vllm/model_executor/models/idefics3.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index e5d2edbd81eb1..17e772e7faa32 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -60,7 +60,8 @@ class Idefics3ImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor """ - Shape: `(batch_size * num_images, num_channels, height, width)` + Shape: `(batch_size * num_images * num_patches, + num_channels, height, width)` """ pixel_attention_mask: Optional[torch.BoolTensor] @@ -520,13 +521,17 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") - return Idefics3ImagePixelInputs(type="pixel_values", - data=self._validate_pixel_values( - flatten_bn(pixel_values, - concat=True)), - pixel_attention_mask=flatten_bn( - pixel_attention_mask, - concat=True)) + if isinstance(pixel_values, list): + pixel_values = torch.cat(pixel_values, dim=1) + pixel_attention_mask = torch.cat(pixel_attention_mask, dim=1) + else: + pixel_values = flatten_bn(pixel_values) + pixel_attention_mask = flatten_bn(pixel_attention_mask) + + return Idefics3ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + pixel_attention_mask=pixel_attention_mask) raise AssertionError("This line should be unreachable.") From 40766ca1b8b0ef92e220595bda96c4336b597e5b Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 11 Dec 2024 04:27:39 -0500 Subject: [PATCH 05/87] [Bugfix]: Clamp `-inf` logprob values in prompt_logprobs (#11073) Signed-off-by: Rafael Vasquez --- vllm/entrypoints/openai/serving_completion.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c54d5f07cf58c..ee97d35f2b087 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -392,6 +392,12 @@ def request_output_to_completion_response( prompt_token_ids = final_res.prompt_token_ids assert prompt_token_ids is not None prompt_logprobs = final_res.prompt_logprobs + if prompt_logprobs: + for logprob_dict in prompt_logprobs: + if logprob_dict: + for logprob_values in logprob_dict.values(): + if logprob_values.logprob == float('-inf'): + logprob_values.logprob = -9999.0 prompt_text = final_res.prompt token_ids: GenericSequence[int] From 8f10d5e3930f05c2057a831cd80ba24c52b8ceef Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Dec 2024 17:28:00 +0800 Subject: [PATCH 06/87] [Misc] Split up pooling tasks (#10820) Signed-off-by: DarkLight1337 --- docs/source/index.rst | 2 + docs/source/models/generative_models.rst | 146 ++++++++++++++++ docs/source/models/pooling_models.rst | 99 +++++++++++ docs/source/models/supported_models.rst | 157 ++++++++++++------ docs/source/usage/compatibility_matrix.rst | 12 +- examples/offline_inference_embedding.py | 7 +- ...ine_inference_vision_language_embedding.py | 4 +- tests/compile/test_basic_correctness.py | 4 +- tests/core/test_scheduler_encoder_decoder.py | 2 +- .../openai/test_vision_embedding.py | 2 +- .../embedding/language/test_embedding.py | 2 +- 
.../models/embedding/language/test_scoring.py | 12 +- .../vision_language/test_dse_qwen2_vl.py | 2 +- .../vision_language/test_llava_next.py | 2 +- .../embedding/vision_language/test_phi3v.py | 2 +- tests/test_config.py | 17 +- vllm/config.py | 137 ++++++++++----- vllm/core/scheduler.py | 2 +- vllm/engine/arg_utils.py | 7 +- vllm/engine/llm_engine.py | 4 +- vllm/entrypoints/llm.py | 53 +++--- vllm/entrypoints/openai/api_server.py | 8 +- vllm/entrypoints/openai/run_batch.py | 4 +- vllm/model_executor/model_loader/utils.py | 2 +- vllm/v1/engine/core.py | 2 +- vllm/worker/cpu_worker.py | 2 +- vllm/worker/worker.py | 2 +- 27 files changed, 527 insertions(+), 168 deletions(-) create mode 100644 docs/source/models/generative_models.rst create mode 100644 docs/source/models/pooling_models.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index ebf1361976c5e..842013d6d49c4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -94,6 +94,8 @@ Documentation :caption: Models models/supported_models + models/generative_models + models/pooling_models models/adding_model models/enabling_multimodal_inputs diff --git a/docs/source/models/generative_models.rst b/docs/source/models/generative_models.rst new file mode 100644 index 0000000000000..fb71185600863 --- /dev/null +++ b/docs/source/models/generative_models.rst @@ -0,0 +1,146 @@ +.. _generative_models: + +Generative Models +================= + +vLLM provides first-class support for generative models, which covers most of LLMs. + +In vLLM, generative models implement the :class:`~vllm.model_executor.models.VllmModelForTextGeneration` interface. +Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, +which are then passed through :class:`~vllm.model_executor.layers.Sampler` to obtain the final text. + +Offline Inference +----------------- + +The :class:`~vllm.LLM` class provides various methods for offline inference. +See :ref:`Engine Arguments ` for a list of options when initializing the model. + +For generative models, the only supported :code:`task` option is :code:`"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + +``LLM.generate`` +^^^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.generate` method is available to all generative models in vLLM. +It is similar to `its counterpart in HF Transformers `__, +except that tokenization and detokenization are also performed automatically. + +.. code-block:: python + + llm = LLM(model="facebook/opt-125m") + outputs = llm.generate("Hello, my name is") + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +You can optionally control the language generation by passing :class:`~vllm.SamplingParams`. +For example, you can use greedy sampling by setting :code:`temperature=0`: + +.. code-block:: python + + llm = LLM(model="facebook/opt-125m") + params = SamplingParams(temperature=0) + outputs = llm.generate("Hello, my name is", params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +A code example can be found in `examples/offline_inference.py `_. + +``LLM.beam_search`` +^^^^^^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.beam_search` method implements `beam search `__ on top of :class:`~vllm.LLM.generate`. +For example, to search using 5 beams and output at most 50 tokens: + +.. 
code-block:: python + + llm = LLM(model="facebook/opt-125m") + params = BeamSearchParams(beam_width=5, max_tokens=50) + outputs = llm.generate("Hello, my name is", params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +``LLM.chat`` +^^^^^^^^^^^^ + +The :class:`~vllm.LLM.chat` method implements chat functionality on top of :class:`~vllm.LLM.generate`. +In particular, it accepts input similar to `OpenAI Chat Completions API `__ +and automatically applies the model's `chat template `__ to format the prompt. + +.. important:: + + In general, only instruction-tuned models have a chat template. + Base models may perform poorly as they are not trained to respond to the chat conversation. + +.. code-block:: python + + llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, + ] + outputs = llm.chat(conversation) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +A code example can be found in `examples/offline_inference_chat.py `_. + +If the model doesn't have a chat template or you want to specify another one, +you can explicitly pass a chat template: + +.. code-block:: python + + from vllm.entrypoints.chat_utils import load_chat_template + + # You can find a list of existing chat templates under `examples/` + custom_template = load_chat_template(chat_template="") + print("Loaded chat template:", custom_template) + + outputs = llm.chat(conversation, chat_template=custom_template) + +Online Inference +---------------- + +Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. +Please click on the above link for more details on how to launch the server. + +Completions API +^^^^^^^^^^^^^^^ + +Our Completions API is similar to ``LLM.generate`` but only accepts text. +It is compatible with `OpenAI Completions API `__ +so that you can use OpenAI client to interact with it. +A code example can be found in `examples/openai_completion_client.py `_. + +Chat API +^^^^^^^^ + +Our Chat API is similar to ``LLM.chat``, accepting both text and :ref:`multi-modal inputs `. +It is compatible with `OpenAI Chat Completions API `__ +so that you can use OpenAI client to interact with it. +A code example can be found in `examples/openai_chat_completion_client.py `_. diff --git a/docs/source/models/pooling_models.rst b/docs/source/models/pooling_models.rst new file mode 100644 index 0000000000000..7fa66274c3c5a --- /dev/null +++ b/docs/source/models/pooling_models.rst @@ -0,0 +1,99 @@ +.. _pooling_models: + +Pooling Models +============== + +vLLM also supports pooling models, including embedding, reranking and reward models. + +In vLLM, pooling models implement the :class:`~vllm.model_executor.models.VllmModelForPooling` interface. +These models use a :class:`~vllm.model_executor.layers.Pooler` to aggregate the final hidden states of the input +before returning them. + +.. note:: + + We currently support pooling models primarily as a matter of convenience. 
+    As shown in the :ref:`Compatibility Matrix `, most vLLM features are not applicable to
+    pooling models as they only work on the generation or decode stage, so performance may not improve as much.
+
+Offline Inference
+-----------------
+
+The :class:`~vllm.LLM` class provides various methods for offline inference.
+See :ref:`Engine Arguments ` for a list of options when initializing the model.
+
+For pooling models, we support the following :code:`task` options:
+
+- Embedding (:code:`"embed"` / :code:`"embedding"`)
+- Classification (:code:`"classify"`)
+- Sentence Pair Scoring (:code:`"score"`)
+- Reward Modeling (:code:`"reward"`)
+
+The selected task determines the default :class:`~vllm.model_executor.layers.Pooler` that is used:
+
+- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization.
+- Classification: Extract only the hidden states corresponding to the last token, and apply softmax.
+- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax.
+- Reward Modeling: Extract all of the hidden states and return them directly.
+
+When loading `Sentence Transformers `__ models,
+we attempt to override the default pooler based on its Sentence Transformers configuration file (:code:`modules.json`).
+
+You can customize the model's pooling method via the :code:`override_pooler_config` option,
+which takes priority over both the model's and Sentence Transformers's defaults.
+
+``LLM.encode``
+^^^^^^^^^^^^^^
+
+The :class:`~vllm.LLM.encode` method is available to all pooling models in vLLM.
+It returns the aggregated hidden states directly.
+
+.. code-block:: python
+
+    llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
+    outputs = llm.encode("Hello, my name is")
+
+    for output in outputs:
+        embeddings = output.outputs.embedding
+        print(f"Embeddings: {embeddings!r} (size={len(embeddings)})")
+
+A code example can be found in `examples/offline_inference_embedding.py `_.
+
+``LLM.score``
+^^^^^^^^^^^^^
+
+The :class:`~vllm.LLM.score` method outputs similarity scores between sentence pairs.
+It is primarily designed for `cross-encoder models `__.
+These types of models serve as rerankers between candidate query-document pairs in RAG systems.
+
+.. note::
+
+    vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
+    To handle RAG at a higher level, you should use integration frameworks such as `LangChain `_.
+
+You can use `these tests `_ as reference.
+
+Online Inference
+----------------
+
+Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference.
+Please click on the above link for more details on how to launch the server.
+
+Embeddings API
+^^^^^^^^^^^^^^
+
+Our Embeddings API is similar to ``LLM.encode``, accepting both text and :ref:`multi-modal inputs `.
+
+The text-only API is compatible with `OpenAI Embeddings API `__
+so that you can use OpenAI client to interact with it.
+A code example can be found in `examples/openai_embedding_client.py `_.
+
+The multi-modal API is an extension of the `OpenAI Embeddings API `__
+that incorporates `OpenAI Chat Completions API `__,
+so it is not part of the OpenAI standard. Please see :ref:`this page ` for more details on how to use it.
+
+Score API
+^^^^^^^^^
+
+Our Score API is similar to ``LLM.score``.
+Please see `this page <../serving/openai_compatible_server.html#score-api-for-cross-encoder-models>`__ for more details on how to use it. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 6540e023c1ab0..b9957cf9563b1 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -3,11 +3,21 @@ Supported Models ================ -vLLM supports a variety of generative and embedding models from `HuggingFace (HF) Transformers `_. -This page lists the model architectures that are currently supported by vLLM. +vLLM supports generative and pooling models across various tasks. +If a model supports more than one task, you can set the task via the :code:`--task` argument. + +For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. -For other models, you can check the :code:`config.json` file inside the model repository. +Loading a Model +^^^^^^^^^^^^^^^ + +HuggingFace Hub ++++++++++++++++ + +By default, vLLM loads models from `HuggingFace (HF) Hub `_. + +To determine whether a given model is supported, you can check the :code:`config.json` file inside the HF repository. If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory. .. tip:: @@ -17,38 +27,57 @@ If the :code:`"architectures"` field contains a model architecture listed below, from vllm import LLM - llm = LLM(model=...) # Name or path of your model + # For generative models (task=generate) only + llm = LLM(model=..., task="generate") # Name or path of your model output = llm.generate("Hello, my name is") print(output) - If vLLM successfully generates text, it indicates that your model is supported. + # For pooling models (task={embed,classify,reward}) only + llm = LLM(model=..., task="embed") # Name or path of your model + output = llm.encode("Hello, my name is") + print(output) + + If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` for instructions on how to implement your model in vLLM. Alternatively, you can `open an issue on GitHub `_ to request vLLM support. -.. note:: - To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: +ModelScope +++++++++++ - .. code-block:: shell +To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: - $ export VLLM_USE_MODELSCOPE=True +.. code-block:: shell - And use with :code:`trust_remote_code=True`. + $ export VLLM_USE_MODELSCOPE=True - .. code-block:: python +And use with :code:`trust_remote_code=True`. - from vllm import LLM +.. 
code-block:: python - llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) + from vllm import LLM + + llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) -Text-only Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^ + # For generative models (task=generate) only + output = llm.generate("Hello, my name is") + print(output) -Text Generation ---------------- + # For pooling models (task={embed,classify,reward}) only + output = llm.encode("Hello, my name is") + print(output) + +List of Text-only Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Generative Models ++++++++++++++++++ + +See :ref:`this page ` for more information on how to use generative models. + +Text Generation (``--task generate``) +------------------------------------- .. list-table:: :widths: 25 25 50 5 5 @@ -328,8 +357,24 @@ Text Generation .. note:: Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -Text Embedding --------------- +Pooling Models +++++++++++++++ + +See :ref:`this page ` for more information on how to use pooling models. + +.. important:: + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. + +Text Embedding (``--task embed``) +--------------------------------- + +Any text generation model can be converted into an embedding model by passing :code:`--task embed`. + +.. note:: + To get the best results, you should use pooling models that are specifically trained as such. + +The following table lists those that are tested in vLLM. .. list-table:: :widths: 25 25 50 5 5 @@ -371,13 +416,6 @@ Text Embedding - - -.. important:: - Some model architectures support both generation and embedding tasks. - In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. - -.. tip:: - You can override the model's pooling method by passing :code:`--override-pooler-config`. - .. note:: :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`. @@ -389,8 +427,8 @@ Text Embedding On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention despite being described otherwise on its model card. -Reward Modeling ---------------- +Reward Modeling (``--task reward``) +----------------------------------- .. list-table:: :widths: 25 25 50 5 5 @@ -416,11 +454,8 @@ Reward Modeling For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. -.. note:: - As an interim measure, these models are supported in both offline and online inference via Embeddings API. - -Classification ---------------- +Classification (``--task classify``) +------------------------------------ .. list-table:: :widths: 25 25 50 5 5 @@ -437,11 +472,8 @@ Classification - ✅︎ - ✅︎ -.. note:: - As an interim measure, these models are supported in both offline and online inference via Embeddings API. - -Sentence Pair Scoring ---------------------- +Sentence Pair Scoring (``--task score``) +---------------------------------------- .. 
list-table:: :widths: 25 25 50 5 5 @@ -468,13 +500,10 @@ Sentence Pair Scoring - - -.. note:: - These models are supported in both offline and online inference via Score API. - .. _supported_mm_models: -Multimodal Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^ +List of Multimodal Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The following modalities are supported depending on the model: @@ -491,8 +520,15 @@ On the other hand, modalities separated by :code:`/` are mutually exclusive. - e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. -Text Generation ---------------- +See :ref:`this page ` on how to pass multi-modal inputs to the model. + +Generative Models ++++++++++++++++++ + +See :ref:`this page ` for more information on how to use generative models. + +Text Generation (``--task generate``) +------------------------------------- .. list-table:: :widths: 25 25 15 20 5 5 5 @@ -696,8 +732,24 @@ Text Generation The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 -Multimodal Embedding --------------------- +Pooling Models +++++++++++++++ + +See :ref:`this page ` for more information on how to use pooling models. + +.. important:: + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. + +Text Embedding (``--task embed``) +--------------------------------- + +Any text generation model can be converted into an embedding model by passing :code:`--task embed`. + +.. note:: + To get the best results, you should use pooling models that are specifically trained as such. + +The following table lists those that are tested in vLLM. .. list-table:: :widths: 25 25 15 25 5 5 @@ -728,12 +780,7 @@ Multimodal Embedding - - ✅︎ -.. important:: - Some model architectures support both generation and embedding tasks. - In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. - -.. tip:: - You can override the model's pooling method by passing :code:`--override-pooler-config`. 
+---- Model Support Policy ===================== diff --git a/docs/source/usage/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.rst index a93632ff36fb8..04dd72b1e3527 100644 --- a/docs/source/usage/compatibility_matrix.rst +++ b/docs/source/usage/compatibility_matrix.rst @@ -39,13 +39,13 @@ Feature x Feature - :abbr:`prmpt adptr (Prompt Adapter)` - :ref:`SD ` - CUDA graph - - :abbr:`emd (Embedding Models)` + - :abbr:`pooling (Pooling Models)` - :abbr:`enc-dec (Encoder-Decoder Models)` - :abbr:`logP (Logprobs)` - :abbr:`prmpt logP (Prompt Logprobs)` - :abbr:`async output (Async Output Processing)` - multi-step - - :abbr:`mm (Multimodal)` + - :abbr:`mm (Multimodal Inputs)` - best-of - beam-search - :abbr:`guided dec (Guided Decoding)` @@ -151,7 +151,7 @@ Feature x Feature - - - - * - :abbr:`emd (Embedding Models)` + * - :abbr:`pooling (Pooling Models)` - ✗ - ✗ - ✗ @@ -253,7 +253,7 @@ Feature x Feature - - - - * - :abbr:`mm (Multimodal)` + * - :abbr:`mm (Multimodal Inputs)` - ✅ - `✗ `__ - `✗ `__ @@ -386,7 +386,7 @@ Feature x Hardware - ✅ - ✗ - ✅ - * - :abbr:`emd (Embedding Models)` + * - :abbr:`pooling (Pooling Models)` - ✅ - ✅ - ✅ @@ -402,7 +402,7 @@ Feature x Hardware - ✅ - ✅ - ✗ - * - :abbr:`mm (Multimodal)` + * - :abbr:`mm (Multimodal Inputs)` - ✅ - ✅ - ✅ diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference_embedding.py index ae158eef2ca4c..17f6d992073d7 100644 --- a/examples/offline_inference_embedding.py +++ b/examples/offline_inference_embedding.py @@ -9,7 +9,12 @@ ] # Create an LLM. -model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) +model = LLM( + model="intfloat/e5-mistral-7b-instruct", + task="embed", # You should pass task="embed" for embedding models + enforce_eager=True, +) + # Generate embedding. The output is a list of PoolingRequestOutputs. outputs = model.encode(prompts) # Print the outputs. 
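For reference, the updated ``examples/offline_inference_embedding.py`` shown in the hunk above corresponds to roughly the following self-contained script. The ``prompts`` list sits outside the hunk, so the one below is illustrative only; the rest follows the lines visible in the diff.

.. code-block:: python

    from vllm import LLM

    # Illustrative prompts; the real example defines its own list above this hunk.
    prompts = [
        "Hello, my name is",
        "The capital of France is",
    ]

    # Create an LLM.
    model = LLM(
        model="intfloat/e5-mistral-7b-instruct",
        task="embed",  # You should pass task="embed" for embedding models
        enforce_eager=True,
    )

    # Generate embedding. The output is a list of PoolingRequestOutputs.
    outputs = model.encode(prompts)

    # Print the outputs.
    for output in outputs:
        print(f"Embedding vector of size {len(output.outputs.embedding)}")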
diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index e1732d045f949..bf466109f0981 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -59,7 +59,7 @@ def run_e5_v(query: Query): llm = LLM( model="royokong/e5-v", - task="embedding", + task="embed", max_model_len=4096, ) @@ -88,7 +88,7 @@ def run_vlm2vec(query: Query): llm = LLM( model="TIGER-Lab/VLM2Vec-Full", - task="embedding", + task="embed", trust_remote_code=True, mm_processor_kwargs={"num_crops": 4}, ) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 99781c55b672e..87d5aefea6cb4 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -55,7 +55,7 @@ class TestSetting: # embedding model TestSetting( model="BAAI/bge-multilingual-gemma2", - model_args=["--task", "embedding"], + model_args=["--task", "embed"], pp_size=1, tp_size=1, attn_backend="FLASHINFER", @@ -65,7 +65,7 @@ class TestSetting: # encoder-based embedding model (BERT) TestSetting( model="BAAI/bge-base-en-v1.5", - model_args=["--task", "embedding"], + model_args=["--task", "embed"], pp_size=1, tp_size=1, attn_backend="XFORMERS", diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index 7cd0416d321ef..16bea54936bc8 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -37,7 +37,7 @@ def test_scheduler_schedule_simple_encoder_decoder(): num_seq_group = 4 max_model_len = 16 scheduler_config = SchedulerConfig( - task="generate", + "generate", max_num_batched_tokens=64, max_num_seqs=num_seq_group, max_model_len=max_model_len, diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 425f2a10ec855..43c63daacb17f 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -27,7 +27,7 @@ def server(): args = [ "--task", - "embedding", + "embed", "--dtype", "bfloat16", "--max-model-len", diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 5ef8540265d14..f458ef5ef556d 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -54,7 +54,7 @@ def test_models( hf_outputs = hf_model.encode(example_prompts) with vllm_runner(model, - task="embedding", + task="embed", dtype=dtype, max_model_len=None, **vllm_extra_kwargs) as vllm_model: diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index 30fa5ea7b36c0..0c3115d195fc1 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -35,9 +35,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict([text_pair]).tolist() - with vllm_runner(model_name, - task="embedding", - dtype=dtype, + with vllm_runner(model_name, task="score", dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) @@ -58,9 +56,7 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: 
hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, - task="embedding", - dtype=dtype, + with vllm_runner(model_name, task="score", dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) @@ -82,9 +78,7 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, - task="embedding", - dtype=dtype, + with vllm_runner(model_name, task="score", dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py index 3dd8cb729f8a6..2641987b25a3a 100644 --- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -93,7 +93,7 @@ def _run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). with vllm_runner(model, - task="embedding", + task="embed", dtype=dtype, enforce_eager=True, max_model_len=8192) as vllm_model: diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 693abd7252d5e..f4cd8b81a0d7d 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -47,7 +47,7 @@ def _run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). with vllm_runner(model, - task="embedding", + task="embed", dtype=dtype, max_model_len=4096, enforce_eager=True) as vllm_model: diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 6145aff1a5ea2..9374c23dd6ffe 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -39,7 +39,7 @@ def _run_test( # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- with vllm_runner(model, task="embedding", dtype=dtype, + with vllm_runner(model, task="embed", dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.encode(input_texts, images=input_images) diff --git a/tests/test_config.py b/tests/test_config.py index 45b0b938af215..4518adfc31bfc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -7,11 +7,17 @@ from vllm.platforms import current_platform -@pytest.mark.parametrize(("model_id", "expected_task"), [ - ("facebook/opt-125m", "generate"), - ("intfloat/e5-mistral-7b-instruct", "embedding"), -]) -def test_auto_task(model_id, expected_task): +@pytest.mark.parametrize( + ("model_id", "expected_runner_type", "expected_task"), + [ + ("facebook/opt-125m", "generate", "generate"), + ("intfloat/e5-mistral-7b-instruct", "pooling", "embed"), + ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"), + ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"), + ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"), + ], +) +def test_auto_task(model_id, expected_runner_type, expected_task): config = ModelConfig( model_id, task="auto", @@ -22,6 +28,7 @@ def test_auto_task(model_id, expected_task): dtype="float16", ) + assert config.runner_type == expected_runner_type assert config.task == expected_task diff --git a/vllm/config.py b/vllm/config.py index 2a9f0ebae997d..2d9a76fe7ddb1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -45,13 +45,27 @@ logger = init_logger(__name__) -_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 +_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 -TaskOption = Literal["auto", "generate", "embedding"] +TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", + "score", "reward"] -# "draft" is only used internally for speculative decoding -_Task = Literal["generate", "embedding", "draft"] +_ResolvedTask = Literal["generate", "embed", "classify", "score", "reward", + "draft"] + +RunnerType = Literal["generate", "pooling", "draft"] + +_RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = { + "generate": ["generate"], + "pooling": ["embed", "classify", "score", "reward"], + "draft": ["draft"], +} + +_TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { + task: runner + for runner, tasks in _RUNNER_TASKS.items() for task in tasks +} HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] @@ -144,7 +158,7 @@ class ModelConfig: def __init__( self, model: str, - task: Union[TaskOption, _Task], + task: Union[TaskOption, Literal["draft"]], tokenizer: str, tokenizer_mode: str, trust_remote_code: bool, @@ -295,6 +309,7 @@ def __init__( supported_tasks, task = self._resolve_task(task, self.hf_config) self.supported_tasks = supported_tasks self.task: Final = task + self.pooler_config = self._init_pooler_config(override_pooler_config) self._verify_quantization() @@ -323,7 +338,7 @@ def _init_pooler_config( override_pooler_config: Optional["PoolerConfig"], ) -> Optional["PoolerConfig"]: - if self.task == "embedding": + if self.runner_type == "pooling": user_config = override_pooler_config or PoolerConfig() base_config = get_pooling_config(self.model, self.revision) @@ -357,60 +372,90 @@ def _verify_tokenizer_mode(self) -> None: "either 'auto', 'slow' or 'mistral'.") self.tokenizer_mode = tokenizer_mode + def _get_preferred_task( + self, + architectures: List[str], + supported_tasks: Set[_ResolvedTask], + ) -> Optional[_ResolvedTask]: + model_id = self.model + if get_pooling_config(model_id, self.revision): + return "embed" + 
if ModelRegistry.is_cross_encoder_model(architectures): + return "score" + + suffix_to_preferred_task: List[Tuple[str, _ResolvedTask]] = [ + # Other models follow this pattern + ("ForCausalLM", "generate"), + ("ForConditionalGeneration", "generate"), + ("ForSequenceClassification", "classify"), + ("ChatModel", "generate"), + ("LMHeadModel", "generate"), + ("EmbeddingModel", "embed"), + ("RewardModel", "reward"), + ] + _, arch = ModelRegistry.inspect_model_cls(architectures) + + for suffix, pref_task in suffix_to_preferred_task: + if arch.endswith(suffix) and pref_task in supported_tasks: + return pref_task + + return None + def _resolve_task( self, - task_option: Union[TaskOption, _Task], + task_option: Union[TaskOption, Literal["draft"]], hf_config: PretrainedConfig, - ) -> Tuple[Set[_Task], _Task]: + ) -> Tuple[Set[_ResolvedTask], _ResolvedTask]: if task_option == "draft": return {"draft"}, "draft" architectures = getattr(hf_config, "architectures", []) - task_support: Dict[_Task, bool] = { + runner_support: Dict[RunnerType, bool] = { # NOTE: Listed from highest to lowest priority, # in case the model supports multiple of them "generate": ModelRegistry.is_text_generation_model(architectures), - "embedding": ModelRegistry.is_pooling_model(architectures), + "pooling": ModelRegistry.is_pooling_model(architectures), } - supported_tasks_lst: List[_Task] = [ - task for task, is_supported in task_support.items() if is_supported + supported_runner_types_lst: List[RunnerType] = [ + runner_type + for runner_type, is_supported in runner_support.items() + if is_supported + ] + + supported_tasks_lst: List[_ResolvedTask] = [ + task for runner_type in supported_runner_types_lst + for task in _RUNNER_TASKS[runner_type] ] supported_tasks = set(supported_tasks_lst) if task_option == "auto": selected_task = next(iter(supported_tasks_lst)) - if len(supported_tasks) > 1: - suffix_to_preferred_task: List[Tuple[str, _Task]] = [ - # Hardcode the models that are exceptions - ("AquilaModel", "generate"), - ("ChatGLMModel", "generate"), - # Other models follow this pattern - ("ForCausalLM", "generate"), - ("ForConditionalGeneration", "generate"), - ("ChatModel", "generate"), - ("LMHeadModel", "generate"), - ("EmbeddingModel", "embedding"), - ("RewardModel", "embedding"), - ("ForSequenceClassification", "embedding"), - ] - info, arch = ModelRegistry.inspect_model_cls(architectures) - - for suffix, pref_task in suffix_to_preferred_task: - if arch.endswith(suffix) and pref_task in supported_tasks: - selected_task = pref_task - break - else: - if (arch.endswith("Model") - and info.architecture.endswith("ForCausalLM") - and "embedding" in supported_tasks): - selected_task = "embedding" + if len(supported_tasks_lst) > 1: + preferred_task = self._get_preferred_task( + architectures, supported_tasks) + if preferred_task is not None: + selected_task = preferred_task logger.info( "This model supports multiple tasks: %s. " "Defaulting to '%s'.", supported_tasks, selected_task) else: + # Aliases + if task_option == "embedding": + preferred_task = self._get_preferred_task( + architectures, supported_tasks) + if preferred_task != "embed": + msg = ("The 'embedding' task will be restricted to " + "embedding models in a future release. 
Please " + "pass `--task classify`, `--task score`, or " + "`--task reward` explicitly for other pooling " + "models.") + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + task_option = preferred_task or "embed" + if task_option not in supported_tasks: msg = ( f"This model does not support the '{task_option}' task. " @@ -533,7 +578,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, # Async postprocessor is not necessary with embedding mode # since there is no token generation - if self.task == "embedding": + if self.runner_type == "pooling": self.use_async_output_proc = False # Reminder: Please update docs/source/usage/compatibility_matrix.rst @@ -750,6 +795,14 @@ def is_cross_encoder(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) return ModelRegistry.is_cross_encoder_model(architectures) + @property + def supported_runner_types(self) -> Set[RunnerType]: + return {_TASK_RUNNER[task] for task in self.supported_tasks} + + @property + def runner_type(self) -> RunnerType: + return _TASK_RUNNER[self.task] + class CacheConfig: """Configuration for the KV cache. @@ -1096,7 +1149,7 @@ def _verify_args(self) -> None: class SchedulerConfig: """Scheduler configuration.""" - task: str = "generate" # The task to use the model for. + runner_type: str = "generate" # The runner type to launch for the model. # Maximum number of tokens to be processed in a single iteration. max_num_batched_tokens: int = field(default=None) # type: ignore @@ -1164,11 +1217,11 @@ def __post_init__(self) -> None: # for higher throughput. self.max_num_batched_tokens = max(self.max_model_len, 2048) - if self.task == "embedding": - # For embedding, choose specific value for higher throughput + if self.runner_type == "pooling": + # Choose specific value for higher throughput self.max_num_batched_tokens = max( self.max_num_batched_tokens, - _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS, + _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, ) if self.is_multimodal_model: # The value needs to be at least the number of multimodal tokens diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 94c62743883ec..c3bc6becf0995 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -337,7 +337,7 @@ def __init__( self.lora_config = lora_config version = "selfattn" - if (self.scheduler_config.task == "embedding" + if (self.scheduler_config.runner_type == "pooling" or self.cache_config.is_attention_free): version = "placeholder" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7b9adc401abcf..d485c2a9e7208 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1066,7 +1066,7 @@ def create_engine_config(self, if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and model_config.task != "embedding"): + and model_config.runner_type != "pooling"): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " @@ -1083,7 +1083,8 @@ def create_engine_config(self, "errors during the initial memory profiling phase, or result " "in low performance due to small KV cache space. 
Consider " "setting --max-model-len to a smaller value.", max_model_len) - elif self.enable_chunked_prefill and model_config.task == "embedding": + elif (self.enable_chunked_prefill + and model_config.runner_type == "pooling"): msg = "Chunked prefill is not supported for embedding models" raise ValueError(msg) @@ -1144,7 +1145,7 @@ def create_engine_config(self, " please file an issue with detailed information.") scheduler_config = SchedulerConfig( - task=model_config.task, + runner_type=model_config.runner_type, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6eca304b45f07..9be30c635cb2c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -288,7 +288,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.model_executor = executor_class(vllm_config=vllm_config, ) - if self.model_config.task != "embedding": + if self.model_config.runner_type != "pooling": self._initialize_kv_caches() # If usage stat is enabled, collect relevant info. @@ -1123,7 +1123,7 @@ def _process_model_outputs(self, seq_group.metrics.model_execute_time = ( o.model_execute_time) - if self.model_config.task == "embedding": + if self.model_config.runner_type == "pooling": self._process_sequence_group_outputs(seq_group, output) else: self.output_processor.process_prompt_logprob(seq_group, output) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2a02187223a33..0bec978c4869c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -381,19 +381,20 @@ def generate( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - task = self.llm_engine.model_config.task - if task != "generate": + runner_type = self.llm_engine.model_config.runner_type + if runner_type != "generate": messages = [ "LLM.generate() is only supported for (conditional) generation " "models (XForCausalLM, XForConditionalGeneration).", ] - supported_tasks = self.llm_engine.model_config.supported_tasks - if "generate" in supported_tasks: + supported_runner_types = self.llm_engine.model_config \ + .supported_runner_types + if "generate" in supported_runner_types: messages.append( - "Your model supports the 'generate' task, but is " - f"currently initialized for the '{task}' task. Please " - "initialize the model using `--task generate`.") + "Your model supports the 'generate' runner, but is " + f"currently initialized for the '{runner_type}' runner. " + "Please initialize vLLM using `--task generate`.") raise ValueError(" ".join(messages)) @@ -793,16 +794,18 @@ def encode( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - task = self.llm_engine.model_config.task - if task != "embedding": - messages = ["LLM.encode() is only supported for embedding models."] + runner_type = self.llm_engine.model_config.runner_type + if runner_type != "pooling": + messages = ["LLM.encode() is only supported for pooling models."] - supported_tasks = self.llm_engine.model_config.supported_tasks - if "embedding" in supported_tasks: + supported_runner_types = self.llm_engine.model_config \ + .supported_runner_types + if "pooling" in supported_runner_types: messages.append( - "Your model supports the 'embedding' task, but is " - f"currently initialized for the '{task}' task. 
Please " - "initialize the model using `--task embedding`.") + "Your model supports the 'pooling' runner, but is " + f"currently initialized for the '{runner_type}' runner. " + "Please initialize vLLM using `--task embed`, " + "`--task classify`, `--task score` etc.") raise ValueError(" ".join(messages)) @@ -864,21 +867,23 @@ def score( A list of ``PoolingRequestOutput`` objects containing the generated scores in the same order as the input prompts. """ - task = self.llm_engine.model_config.task - if task != "embedding": - messages = ["LLM.score() is only supported for embedding models."] + runner_type = self.llm_engine.model_config.runner_type + if runner_type != "pooling": + messages = ["LLM.score() is only supported for pooling models."] - supported_tasks = self.llm_engine.model_config.supported_tasks - if "embedding" in supported_tasks: + supported_runner_types = self.llm_engine.model_config \ + .supported_runner_types + if "pooling" in supported_runner_types: messages.append( - "Your model supports the 'embedding' task, but is " - f"currently initialized for the '{task}' task. Please " - "initialize the model using `--task embedding`.") + "Your model supports the 'pooling' runner, but is " + f"currently initialized for the '{runner_type}' runner. " + "Please initialize vLLM using `--task embed`, " + "`--task classify`, `--task score` etc.") raise ValueError(" ".join(messages)) if not self.llm_engine.model_config.is_cross_encoder: - raise ValueError("Your model does not support the cross encoding") + raise ValueError("Your model does not support cross encoding") tokenizer = self.llm_engine.get_tokenizer() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0f93eb54111ad..a345f8caeeed2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -573,7 +573,7 @@ def init_app_state( enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, - ) if model_config.task == "generate" else None + ) if model_config.runner_type == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, @@ -582,7 +582,7 @@ def init_app_state( prompt_adapters=args.prompt_adapters, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) if model_config.task == "generate" else None + ) if model_config.runner_type == "generate" else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, @@ -590,13 +590,13 @@ def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if model_config.task == "embedding" else None + ) if model_config.runner_type == "pooling" else None state.openai_serving_scores = OpenAIServingScores( engine_client, model_config, base_model_paths, request_logger=request_logger - ) if (model_config.task == "embedding" \ + ) if (model_config.runner_type == "pooling" \ and model_config.is_cross_encoder) else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 00cdb3b6839f5..675daf54c0d0d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -224,7 +224,7 @@ async def main(args): chat_template=None, chat_template_content_format="auto", 
enable_prompt_tokens_details=args.enable_prompt_tokens_details, - ) if model_config.task == "generate" else None + ) if model_config.runner_type == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( engine, model_config, @@ -232,7 +232,7 @@ async def main(args): request_logger=request_logger, chat_template=None, chat_template_content_format="auto", - ) if model_config.task == "embedding" else None + ) if model_config.runner_type == "pooling" else None tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index cfb89e0f336bc..f15e7176b3d50 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -35,7 +35,7 @@ def get_model_architecture( architectures = ["QuantMixtralForCausalLM"] model_cls, arch = ModelRegistry.resolve_model_cls(architectures) - if model_config.task == "embedding": + if model_config.runner_type == "pooling": model_cls = as_embedding_model(model_cls) return model_cls, arch diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index fdb241e6753fb..55a5c4dff3a5c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -42,7 +42,7 @@ def __init__( executor_class: Type[Executor], usage_context: UsageContext, ): - assert vllm_config.model_config.task != "embedding" + assert vllm_config.model_config.runner_type != "pooling" logger.info("Initializing an LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 4fad1a3f4caeb..ba3d4a130a80b 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -163,7 +163,7 @@ def __init__( not in ["medusa", "mlp_speculator", "eagle"]) \ else {"return_hidden_states": True} ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner - if self.model_config.task == "embedding": + if self.model_config.runner_type == "pooling": ModelRunnerClass = CPUPoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 094dd5a5d08b3..832b9903b7abc 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -75,7 +75,7 @@ def __init__( else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner - if model_config.task == "embedding": + if model_config.runner_type == "pooling": ModelRunnerClass = PoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner From cad5c0a6eda057eeece87a42fff49fef3e18a2ac Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Dec 2024 21:36:27 +0800 Subject: [PATCH 07/87] [Doc] Update docs to refer to pooling models (#11093) Signed-off-by: DarkLight1337 --- docs/source/usage/faq.rst | 7 ++++++- vllm/attention/backends/placeholder_attn.py | 2 +- vllm/config.py | 8 ++++---- vllm/core/placeholder_block_space_manager.py | 2 +- vllm/engine/arg_utils.py | 4 ++-- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/multiprocessing/client.py | 2 +- vllm/engine/protocol.py | 2 +- vllm/entrypoints/openai/serving_score.py | 2 +- vllm/sequence.py | 6 +++--- vllm/v1/engine/processor.py | 2 +- vllm/worker/cpu_worker.py | 2 +- vllm/worker/hpu_worker.py | 4 ++-- vllm/worker/worker.py | 2 +- 14 files changed, 26 insertions(+), 21 deletions(-) diff --git a/docs/source/usage/faq.rst b/docs/source/usage/faq.rst index ce327abd5fa20..d88da32092924 
100644 --- a/docs/source/usage/faq.rst +++ b/docs/source/usage/faq.rst @@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul Q: Which model to use for offline inference embedding? -A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model +A: You can try `e5-mistral-7b-instruct `__ and `BAAI/bge-base-en-v1.5 `__; +more are listed :ref:`here `. + +By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B `__, +`Mistral-7B-Instruct-v0.3 `__ into embedding models, +but they are expected be inferior to models that are specifically trained on embedding tasks. ---------------------------------------- diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 658039bfc3365..534f79b3a60bf 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -14,7 +14,7 @@ from vllm.worker.model_runner import (ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata) -# Placeholder attention backend for models like Mamba and embedding models that +# Placeholder attention backend for models like Mamba and pooling models that # lack attention. diff --git a/vllm/config.py b/vllm/config.py index 2d9a76fe7ddb1..322c8f8990a40 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -152,7 +152,7 @@ class ModelConfig: this argument will be used to configure the neuron config that can not be gathered from the vllm arguments. override_pooler_config: Initialize non default pooling config or - override default pooling config for the embedding model. + override default pooling config for the pooling model. """ def __init__( @@ -576,7 +576,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Async postprocessor is not necessary with embedding mode + # Async postprocessor is not necessary for pooling models # since there is no token generation if self.runner_type == "pooling": self.use_async_output_proc = False @@ -1825,11 +1825,11 @@ class MultiModalConfig: @dataclass class PoolerConfig: - """Controls the behavior of output pooling in embedding models.""" + """Controls the behavior of output pooling in pooling models.""" pooling_type: Optional[str] = None """ - The pooling method of the embedding model. This should be a key in + The pooling method of the pooling model. This should be a key in :class:`vllm.model_executor.layers.pooler.PoolingType`. """ diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index 26d42b7f1790e..a47e594518534 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -8,7 +8,7 @@ class PlaceholderBlockSpaceManager(BlockSpaceManager): """A version of BlockSpaceManager for use in environments where block management is not required. - For example: embedding models or attention-free models like Mamba. + For example: pooling models or attention-free models like Mamba. 
This class provides the same interface as BlockSpaceManager, but its methods perform no actions or return simple values like True in specific diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d485c2a9e7208..7337522bc9952 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -893,7 +893,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--override-pooler-config', type=PoolerConfig.from_json, default=None, - help="Override or set the pooling method in the embedding model. " + help="Override or set the pooling method for pooling models. " "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'") parser.add_argument('--compilation-config', @@ -1085,7 +1085,7 @@ def create_engine_config(self, "setting --max-model-len to a smaller value.", max_model_len) elif (self.enable_chunked_prefill and model_config.runner_type == "pooling"): - msg = "Chunked prefill is not supported for embedding models" + msg = "Chunked prefill is not supported for pooling models" raise ValueError(msg) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 60dccd7a0812c..32396fd10188d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1085,7 +1085,7 @@ async def encode( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from an embedding model. + """Generate outputs for a request from a pooling model. Generate outputs for a request. This method is a coroutine. It adds the request into the waiting queue of the LLMEngine and streams the outputs diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index a729023bc00bb..0a046c71e86e8 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -527,7 +527,7 @@ def encode( *, inputs: Optional[PromptType] = None # DEPRECATED ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from an embedding model. + """Generate outputs for a request from a pooling model. Generate outputs for a request. This method is a coroutine. It adds the request into the waiting queue of the LLMEngine and streams the outputs diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 4079de7d36793..a066836b92708 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -209,7 +209,7 @@ def encode( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from an embedding model.""" + """Generate outputs for a request from a pooling model.""" ... @abstractmethod diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index fed06fa452955..4929e720c00e4 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -119,7 +119,7 @@ async def create_score( if prompt_adapter_request is not None: raise NotImplementedError("Prompt adapter is not supported " - "for embedding models") + "for scoring models") if isinstance(tokenizer, MistralTokenizer): raise ValueError( diff --git a/vllm/sequence.py b/vllm/sequence.py index 669124319c4f4..b0f3c1cc3609f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -618,9 +618,9 @@ class SequenceGroup: arrival_time: The arrival time of the request. lora_request: LoRA request. 
embeddings: The embeddings vectors of the prompt of the sequence group - for an embedding model. + for a pooling model. pooling_params: The pooling parameters used to generate the pooling - for an embedding model. + for a pooling model. encoder_seq: Optional, the single encoder sequence. Should be None unless you are working with an encoder/decoder model. trace_headers: OpenTelemetry trace headers. @@ -1102,7 +1102,7 @@ class PoolerOutput( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] - """The output from a pooling operation in the embedding model.""" + """The output from a pooling operation in the pooling model.""" outputs: List[EmbeddingSequenceGroupOutput] # lazy import to avoid circular import diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 120fc64969552..e0e525b30a767 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -59,7 +59,7 @@ def process_inputs( priority: int = 0, ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: - # TODO(woosuk): Support embedding mode. + # TODO(woosuk): Support pooling models. # TODO(woosuk): Check max_logprobs # TODO(woosuk): Support encoder-decoder models. diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index ba3d4a130a80b..09758a5d9accf 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -178,7 +178,7 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CPUCacheEngine] - # Initialize cpu_cache as embedding models don't initialize kv_caches + # Initialize cpu_cache as pooling models don't initialize kv_caches self.cpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. Enabled and configured through env vars: diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 493f7a9fad098..cca7cd50bfc7b 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -65,8 +65,8 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[HPUCacheEngine] - # Initialize gpu_cache as embedding models don't initialize kv_caches - self.hpu_cache: Optional[List[List[torch.tensor]]] = None + # Initialize gpu_cache as pooling models don't initialize kv_caches + self.hpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 832b9903b7abc..a368bb9ee9a5b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -91,7 +91,7 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: List[CacheEngine] - # Initialize gpu_cache as embedding models don't initialize kv_caches + # Initialize gpu_cache as pooling models don't initialize kv_caches self.gpu_cache: Optional[List[List[torch.Tensor]]] = None self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} From b2f775456e4af7412308320a9c11e4dac3086205 Mon Sep 17 00:00:00 2001 From: hissu-hyvarinen Date: Wed, 11 Dec 2024 17:23:37 +0200 Subject: [PATCH 08/87] [CI/Build] Enable prefix caching test for AMD (#11098) Signed-off-by: Hissu Hyvarinen --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8f57006214c88..df4fa7a6ee9ba 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -201,7 +201,7 @@ steps: - python3 offline_profile.py --model facebook/opt-125m - label: Prefix Caching Test # 9min - #mirror_hardwares: [amd] + mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/prefix_caching From fd22220687af5ccd89d9f8f2812069ef0422244c Mon Sep 17 00:00:00 2001 From: bingps <46775742+bingps@users.noreply.github.com> Date: Wed, 11 Dec 2024 23:43:24 +0800 Subject: [PATCH 09/87] [Doc] Installed version of llmcompressor for int8/fp8 quantization (#11103) Signed-off-by: Guangda Liu Co-authored-by: Guangda Liu --- docs/source/quantization/fp8.rst | 2 +- docs/source/quantization/int8.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst index aacd07a34ad46..4dbf8e9d346e1 100644 --- a/docs/source/quantization/fp8.rst +++ b/docs/source/quantization/fp8.rst @@ -45,7 +45,7 @@ To produce performant FP8 quantized models with vLLM, you'll need to install the .. code-block:: console - $ pip install llmcompressor==0.1.0 + $ pip install llmcompressor Quantization Process -------------------- diff --git a/docs/source/quantization/int8.rst b/docs/source/quantization/int8.rst index 04fa308449507..aa5b251becb1c 100644 --- a/docs/source/quantization/int8.rst +++ b/docs/source/quantization/int8.rst @@ -19,7 +19,7 @@ To use INT8 quantization with vLLM, you'll need to install the `llm-compressor < .. code-block:: console - $ pip install llmcompressor==0.1.0 + $ pip install llmcompressor Quantization Process -------------------- @@ -142,4 +142,4 @@ Best Practices Troubleshooting and Support --------------------------- -If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. \ No newline at end of file +If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. 
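The quantization pages patched above reduce to a short offline flow: install ``llmcompressor``, apply a one-shot FP8 recipe, and point vLLM at the saved checkpoint. The sketch below is a minimal illustration under stated assumptions: it relies on the ``oneshot`` entry point and ``QuantizationModifier`` with the ``FP8_DYNAMIC`` scheme that the patched ``fp8.rst`` describes, and the model ID and output directory are placeholders only.

.. code-block:: python

    # Minimal sketch of the FP8 quantization flow referenced in the docs above.
    # Assumes `pip install llmcompressor`; model and output names are illustrative.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    from llmcompressor.modifiers.quantization import QuantizationModifier
    from llmcompressor.transformers import oneshot

    MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID,
                                                 device_map="auto",
                                                 torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Dynamic FP8 activation quantization needs no calibration data.
    recipe = QuantizationModifier(targets="Linear",
                                  scheme="FP8_DYNAMIC",
                                  ignore=["lm_head"])
    oneshot(model=model, recipe=recipe)

    SAVE_DIR = MODEL_ID.split("/")[-1] + "-FP8-Dynamic"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)

The resulting directory can then be loaded as usual, for example with ``vllm serve ./Meta-Llama-3-8B-Instruct-FP8-Dynamic`` or ``LLM(model=SAVE_DIR)``, since vLLM picks up the compressed-tensors quantization config from the checkpoint.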
From 91642db952458fbb6ae7c2d167757dc86b105991 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 11 Dec 2024 10:43:05 -0800 Subject: [PATCH 10/87] [torch.compile] use depyf to dump torch.compile internals (#10972) Signed-off-by: youkaichao --- requirements-common.txt | 1 + vllm/compilation/backends.py | 69 ++++++++++++++++++---------------- vllm/compilation/decorators.py | 2 +- vllm/compilation/monitor.py | 23 ++++++++++-- vllm/compilation/wrapper.py | 4 +- vllm/config.py | 6 ++- vllm/worker/model_runner.py | 3 +- 7 files changed, 66 insertions(+), 42 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 792cd58e80669..850b8f4101701 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -33,3 +33,4 @@ six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that need setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. compressed-tensors == 0.8.0 # required for compressed-tensors +depyf==0.18.0 # required for profiling and debugging torch.compile diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index f002a8ff905b1..09a3daa731829 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -9,7 +9,7 @@ import torch.fx as fx import vllm.envs as envs -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, VllmConfig from vllm.logger import init_logger from vllm.utils import weak_ref_tensors @@ -149,14 +149,15 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): """ def __init__(self, module: torch.fx.GraphModule, - compile_submod_names: List[str], - compilation_configs: CompilationConfig, graph_pool): + compile_submod_names: List[str], vllm_config: VllmConfig, + graph_pool): super().__init__(module) from torch._guards import detect_fake_mode self.fake_mode = detect_fake_mode() self.compile_submod_names = compile_submod_names - self.compilation_configs = compilation_configs + self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool + self.vllm_config = vllm_config def run(self, *args): fake_args = [ @@ -182,15 +183,15 @@ def call_module(self, target: torch.fx.node.Target, compiled_graph_for_general_shape = wrap_inductor( submod, args, - self.compilation_configs.inductor_compile_config, - self.compilation_configs, + self.compilation_config.inductor_compile_config, + self.compilation_config, graph_index=index, num_graphs=len(self.compile_submod_names), runtime_shape=None, - use_inductor=self.compilation_configs.use_inductor) + use_inductor=self.compilation_config.use_inductor) self.module.__dict__[target] = PiecewiseBackend( - submod, self.compilation_configs, self.graph_pool, index, + submod, self.vllm_config, self.graph_pool, index, len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_general_shape) @@ -211,7 +212,8 @@ class VllmBackend: which handles the post-grad passes. 
""" - compilation_configs: CompilationConfig + vllm_config: VllmConfig + compilation_config: CompilationConfig graph_pool: Any _called: bool = False # the graph we compiled @@ -227,7 +229,7 @@ class VllmBackend: def __init__( self, - compilation_configs: CompilationConfig, + vllm_config: VllmConfig, ): global global_graph_pool if global_graph_pool is None: @@ -244,13 +246,14 @@ def __init__( self.sym_tensor_indices = [] self.input_buffers = [] - self.compilation_configs = compilation_configs + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config # `torch.compile` is JIT compiled, so we don't need to # do anything here def configure_post_pass(self): - config = self.compilation_configs + config = self.compilation_config self.post_grad_pass_manager.configure(config.pass_config) # Post-grad custom passes are run using the post_grad_custom_post_pass @@ -271,7 +274,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: from .monitor import torch_compile_start_time dynamo_time = time.time() - torch_compile_start_time logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time) - self.compilation_configs.compilation_time += dynamo_time + self.compilation_config.compilation_time += dynamo_time # we control the compilation process, each instance can only be # called once @@ -281,7 +284,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self.configure_post_pass() self.split_gm, self.piecewise_graphs = split_graph( - graph, self.compilation_configs.splitting_ops) + graph, self.compilation_config.splitting_ops) from torch._dynamo.utils import lazy_format_graph_code logger.debug("%s", lazy_format_graph_code("before split", self.graph)) @@ -298,13 +301,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # propagate the split graph to the piecewise backend, # compile submodules with symbolic shapes PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, - self.compilation_configs, + self.vllm_config, self.graph_pool).run(*example_inputs) self._called = True - if not self.compilation_configs.use_cudagraph or \ - not self.compilation_configs.cudagraph_copy_inputs: + if not self.compilation_config.use_cudagraph or \ + not self.compilation_config.cudagraph_copy_inputs: return self.split_gm # if we need to copy input buffers for cudagraph @@ -364,10 +367,9 @@ class ConcreteSizeEntry: class PiecewiseBackend: - def __init__(self, graph: fx.GraphModule, - compilation_configs: CompilationConfig, graph_pool: Any, - piecewise_compile_index: int, total_piecewise_compiles: int, - sym_shape_indices: List[int], + def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, + graph_pool: Any, piecewise_compile_index: int, + total_piecewise_compiles: int, sym_shape_indices: List[int], compiled_graph_for_general_shape: Callable): """ The backend for piecewise compilation. @@ -375,7 +377,7 @@ def __init__(self, graph: fx.GraphModule, We will compile `self.graph` once for the general shape, and then compile for different shapes specified in - `compilation_configs.compile_sizes`. + `compilation_config.compile_sizes`. Independently, we will capture cudagraph for different shapes. @@ -383,7 +385,8 @@ def __init__(self, graph: fx.GraphModule, compile it first, and then capture cudagraph. 
""" self.graph = graph - self.compilation_configs = compilation_configs + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool self.piecewise_compile_index = piecewise_compile_index self.total_piecewise_compiles = total_piecewise_compiles @@ -393,10 +396,10 @@ def __init__(self, graph: fx.GraphModule, piecewise_compile_index == total_piecewise_compiles - 1) self.compile_sizes: Set[int] = set( - self.compilation_configs.compile_sizes) + self.compilation_config.compile_sizes) self.capture_sizes: Set[int] = set( - self.compilation_configs.capture_sizes - ) if self.compilation_configs.use_cudagraph else set() + self.compilation_config.capture_sizes + ) if self.compilation_config.use_cudagraph else set() self.first_run_finished = False @@ -423,7 +426,7 @@ def __call__(self, *args) -> Any: self.first_run_finished = True # no specific sizes to compile if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.compilation_configs) + end_monitoring_torch_compile(self.vllm_config) return self.compiled_graph_for_general_shape(*args) runtime_shape = args[self.sym_shape_indices[0]] @@ -443,28 +446,28 @@ def __call__(self, *args) -> Any: entry.runnable = wrap_inductor( self.graph, args, - self.compilation_configs.inductor_compile_config, - self.compilation_configs, + self.compilation_config.inductor_compile_config, + self.compilation_config, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, runtime_shape=runtime_shape, - use_inductor=self.compilation_configs.use_inductor) + use_inductor=self.compilation_config.use_inductor) # finished compilations for all required shapes if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.compilation_configs) + end_monitoring_torch_compile(self.vllm_config) if not entry.use_cudagraph: return entry.runnable(*args) if entry.cudagraph is None: - if entry.num_finished_warmup < self.compilation_configs.cudagraph_num_of_warmups: # noqa + if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups: # noqa entry.num_finished_warmup += 1 if self.is_first_graph: logger.debug( "Warming up %s/%s for shape %s", entry.num_finished_warmup, - self.compilation_configs.cudagraph_num_of_warmups, + self.compilation_config.cudagraph_num_of_warmups, runtime_shape) return entry.runnable(*args) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 938430fe2a501..805a217ee6ca1 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -185,7 +185,7 @@ def __call__(self, *args, **kwargs): "Unsupported dynamic dimensions" f" {dims} for argument {k} with type {type(arg)}.") # here, it is the starting point of the `torch.compile` process - start_monitoring_torch_compile(self.vllm_config.compilation_config) + start_monitoring_torch_compile(self.vllm_config) # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 3348674b09af2..b97e40415b41b 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,19 +1,36 @@ +import os import time -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) +context_manager = None torch_compile_start_time: float = 
0.0 -def start_monitoring_torch_compile(compilation_config: CompilationConfig): +def start_monitoring_torch_compile(vllm_config: VllmConfig): global torch_compile_start_time torch_compile_start_time = time.time() + compilation_config: CompilationConfig = vllm_config.compilation_config + if compilation_config.level == CompilationLevel.PIECEWISE and \ + compilation_config.debug_dump_path: + import depyf + path = os.path.join(compilation_config.debug_dump_path, + f"rank_{vllm_config.parallel_config.rank}") + global context_manager + context_manager = depyf.prepare_debug(path) + context_manager.__enter__() -def end_monitoring_torch_compile(compilation_config: CompilationConfig): + +def end_monitoring_torch_compile(vllm_config: VllmConfig): + compilation_config: CompilationConfig = vllm_config.compilation_config if compilation_config.level == CompilationLevel.PIECEWISE: logger.info("torch.compile takes %.2f s in total", compilation_config.compilation_time) + global context_manager + if context_manager is not None: + context_manager.__exit__(None, None, None) + context_manager = None diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index bc4d292fef402..c10241b483169 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -32,8 +32,8 @@ def __init__(self, # default compilation settings # compiling the forward method - backend = get_current_vllm_config( - ).compilation_config.init_backend() + vllm_config = get_current_vllm_config() + backend = vllm_config.compilation_config.init_backend(vllm_config) compiled_callable = torch.compile( self.forward, diff --git a/vllm/config.py b/vllm/config.py index 322c8f8990a40..7f9be5a3a98bc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2222,6 +2222,7 @@ class CompilationConfig(BaseModel): - 1: dynamo as is. - 2: dynamo once. - 3: piecewise compilation. + - debug_dump_path: the path to dump the debug information. - backend: the backend for compilation. It needs to be a string. - "" (empty string): use the default backend. - "eager"/"openxla"/...: use the specified backend registered in PyTorch. @@ -2289,6 +2290,7 @@ class CompilationConfig(BaseModel): certain small batchsizes, where inductor is good at optimizing. 
""" # noqa level: int = 0 + debug_dump_path: str = "" backend: str = "" custom_ops: List[str] = Field(default_factory=list) splitting_ops: List[str] = Field(default_factory=lambda: [ @@ -2394,7 +2396,7 @@ def model_post_init(self, __context: Any) -> None: self.static_forward_context = {} self.compilation_time = 0.0 - def init_backend(self) -> Union[str, Callable]: + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -2413,7 +2415,7 @@ def init_backend(self) -> Union[str, Callable]: # merge with the config use_inductor assert self.level == CompilationLevel.PIECEWISE from vllm.compilation.backends import VllmBackend - return VllmBackend(self) + return VllmBackend(vllm_config) def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]): """To complete the initialization of config, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 551b84435fdc0..26fd486130ce6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1162,7 +1162,8 @@ def load_model(self) -> None: if self.vllm_config.compilation_config.level ==\ CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): - backend = self.vllm_config.compilation_config.init_backend() + backend = self.vllm_config.compilation_config.init_backend( + self.vllm_config) self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, From d643c2aba1cd5421200f3a3bad1813dd067233b4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 11 Dec 2024 10:49:23 -0800 Subject: [PATCH 11/87] [V1] Use input_ids as input for text-only models (#11032) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8d9976ded7c5e..e75be21ef2d91 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -61,6 +61,7 @@ def __init__( self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] + self.is_multimodal_model = model_config.is_multimodal_model self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len @@ -103,6 +104,11 @@ def __init__( # The batch sizes in the config are in descending order. self.cudagraph_batch_sizes = list( reversed(self.vllm_config.compilation_config.capture_sizes)) + + # Persistent buffers for CUDA graphs. + self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=self.device) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) @@ -310,7 +316,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - input_ids = input_ids.to(self.device, non_blocking=True) + self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, + non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) query_start_loc = query_start_loc.to(self.device, non_blocking=True) @@ -331,7 +338,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. 
logits_indices = query_start_loc[1:] - 1 - return input_ids, attn_metadata, logits_indices + return attn_metadata, logits_indices def _prepare_sampling( self, @@ -427,13 +434,15 @@ def execute_model( ) -> ModelRunnerOutput: self._update_states(scheduler_output) - # Run the encoder. - self._execute_encoder(scheduler_output) - encoder_outputs = self._gather_encoder_outputs(scheduler_output) + if self.is_multimodal_model: + # Run the multimodal encoder if any. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + else: + encoder_outputs = [] # Prepare the decoder inputs. - input_ids, attn_metadata, logits_indices = self._prepare_inputs( - scheduler_output) + attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -444,29 +453,39 @@ def execute_model( else: # Eager mode. num_input_tokens = num_scheduled_tokens - attn_metadata.num_input_tokens = num_input_tokens - # Get the inputs embeds. - if encoder_outputs: - inputs_embeds = self.model.get_input_embeddings( - input_ids, encoder_outputs) + if self.is_multimodal_model: + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + input_ids = self.input_ids[:num_scheduled_tokens] + if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + inputs_embeds = self.inputs_embeds[:num_input_tokens] + input_ids = None else: - inputs_embeds = self.model.get_input_embeddings(input_ids) - # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings), - # always use embeddings (rather than token ids) as input to the model. - # TODO(woosuk): Avoid the copy. Optimize. - self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + # For text-only models, we use token ids as input. + # While it is possible to use embeddings as input just like the + # multimodal models, it is not desirable for performance since + # then the embedding layer is not included in the CUDA graph. + input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None # Run the decoder. # Use persistent buffers for CUDA graphs. 
with set_forward_context(attn_metadata, self.vllm_config): hidden_states = self.model( - input_ids=None, + input_ids=input_ids, positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, attn_metadata=None, - inputs_embeds=self.inputs_embeds[:num_input_tokens], + inputs_embeds=inputs_embeds, ) hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] @@ -534,13 +553,20 @@ def _dummy_run( num_tokens: int, kv_caches: List[torch.Tensor], ) -> torch.Tensor: + if self.is_multimodal_model: + input_ids = None + inputs_embeds = self.inputs_embeds[:num_tokens] + else: + input_ids = self.input_ids[:num_tokens] + inputs_embeds = None with set_forward_context(None, self.vllm_config): hidden_states = model( - input_ids=None, + input_ids=input_ids, positions=self.positions[:num_tokens], kv_caches=kv_caches, attn_metadata=None, - inputs_embeds=self.inputs_embeds[:num_tokens]) + inputs_embeds=inputs_embeds, + ) return hidden_states def profile_run(self) -> None: From 66aaa7722df3d7ef9e9bd2942cab5cd0d7473174 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 11 Dec 2024 10:59:50 -0800 Subject: [PATCH 12/87] [torch.compile] remove graph logging in ci (#11110) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 09a3daa731829..4a5dc337d01b8 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -287,9 +287,11 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: graph, self.compilation_config.splitting_ops) from torch._dynamo.utils import lazy_format_graph_code - logger.debug("%s", lazy_format_graph_code("before split", self.graph)) - logger.debug("%s", lazy_format_graph_code("after split", - self.split_gm)) + + # depyf will hook lazy_format_graph_code and dump the graph + # for debugging, no need to print the graph here + lazy_format_graph_code("before split", self.graph) + lazy_format_graph_code("after split", self.split_gm) compilation_counter.num_piecewise_graphs_seen += len( self.piecewise_graphs) From 72ff3a968682e6a3f7620ab59f2baf5e8eb2777b Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:36:35 -0800 Subject: [PATCH 13/87] [core] Bump ray to use _overlap_gpu_communication in compiled graph tests (#10410) Signed-off-by: Rui Qiao Signed-off-by: Rui Qiao Co-authored-by: Rui Qiao --- requirements-test.in | 2 +- requirements-test.txt | 2 +- vllm/envs.py | 8 ++++++++ vllm/executor/ray_gpu_executor.py | 17 ++++++++++------- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/requirements-test.in b/requirements-test.in index c0b228148ab31..57fddb416317e 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -13,7 +13,7 @@ einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests peft -ray[adag]==2.35 +ray[adag]==2.40.0 sentence-transformers # required for embedding tests soundfile # required for audio tests timm # required for internvl test diff --git a/requirements-test.txt b/requirements-test.txt index 8ceb705cdffd7..c786a1249bddb 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -410,7 +410,7 @@ pyyaml==6.0.2 # ray # timm # transformers -ray[adag]==2.35.0 +ray[adag]==2.40.0 # via -r requirements-test.in redis==5.2.0 # via tensorizer diff --git a/vllm/envs.py b/vllm/envs.py index be5d9985b63a4..bc8c1499e9534 100644 --- a/vllm/envs.py +++ 
b/vllm/envs.py @@ -45,6 +45,7 @@ VLLM_USE_RAY_SPMD_WORKER: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True + VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = True VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 @@ -337,6 +338,13 @@ def get_default_config_root(): lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1")) ), + # If the env var is set, it enables GPU communication overlap in + # Ray's compiled DAG. This flag is ignored if + # VLLM_USE_RAY_COMPILED_DAG is not set. + "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": + lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "1")) + ), + # Use dedicated multiprocess context for workers. # Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 4263fb27265f6..4bf5cbbd18ffe 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -414,12 +414,10 @@ def _check_ray_adag_installation(self): import pkg_resources from packaging import version - required_version = version.parse("2.35") + required_version = version.parse("2.40") current_version = version.parse( pkg_resources.get_distribution("ray").version) - # TODO: update the constraint once we adapt to the backward - # incompatible API change from ray 2.36 - if current_version != required_version: + if current_version < required_version: raise ValueError(f"Ray version {required_version} is " f"required, but found {current_version}") @@ -445,6 +443,8 @@ def _compiled_ray_dag(self, enable_asyncio: bool): logger.info("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = %s", envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL) + logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s", + envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) with InputNode() as input_data: # Example DAG: PP=2, TP=4 # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501 @@ -480,7 +480,10 @@ def _compiled_ray_dag(self, enable_asyncio: bool): forward_dag = MultiOutputNode(outputs) - return forward_dag.experimental_compile(enable_asyncio=enable_asyncio) + return forward_dag.experimental_compile( + enable_asyncio=enable_asyncio, + _overlap_gpu_communication=envs. 
+ VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) def __del__(self): self.shutdown() @@ -507,8 +510,8 @@ async def execute_model_async( serialized_data = self.input_encoder.encode(execute_model_req) dag_future = await self.forward_dag.execute_async(serialized_data) - outputs = await dag_future - return self.output_decoder.decode(outputs[0]) + output = await dag_future[0] + return self.output_decoder.decode(output) async def _driver_execute_model_async( self, From d1e21a979bba4712f48dac1bbf410e0b57c92e7a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 12 Dec 2024 06:18:16 +0800 Subject: [PATCH 14/87] [CI/Build] Split up VLM tests (#11083) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 32 ++++++--- pyproject.toml | 3 +- .../vision_language/test_models.py | 72 ++++++++++++------- tests/utils.py | 37 ++++++---- 4 files changed, 94 insertions(+), 50 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index df4fa7a6ee9ba..aca505178df06 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -321,7 +321,7 @@ steps: ##### models test ##### -- label: Basic Models Test # 30min +- label: Basic Models Test # 24min source_file_dependencies: - vllm/ - tests/models @@ -331,7 +331,7 @@ steps: - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py -- label: Language Models Test (Standard) # 42min +- label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -342,7 +342,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model -- label: Language Models Test (Extended) # 50min +- label: Language Models Test (Extended) # 1h10min optional: true source_file_dependencies: - vllm/ @@ -353,7 +353,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 26min +- label: Multi-Modal Models Test (Standard) # 28min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -369,7 +369,7 @@ steps: - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Multi-Modal Models Test (Extended) # 1h15m +- label: Multi-Modal Models Test (Extended) 1 # 1h16m optional: true source_file_dependencies: - vllm/ @@ -380,14 +380,24 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not 
core_model' +- label: Multi-Modal Models Test (Extended) 2 # 38m + optional: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/vision_language + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' + # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test optional: true @@ -446,11 +456,11 @@ steps: - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus - - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus + - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py @@ -540,7 +550,7 @@ steps: # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional diff --git a/pyproject.toml b/pyproject.toml index 253b706a774a7..c5a14ecf5aea9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,7 +96,8 @@ markers = [ "core_model: enable this model test in each PR instead of only nightly", "cpu_model: enable this model test in CPU tests", "quant_model: run this model test under Quantized category", - "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", + "split: run this test as part of a split", + "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", ] diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index ed8f34a677f84..3101d1d2ea831 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -1,7 +1,9 @@ """Common tests for testing .generate() functionality for single / multiple image, embedding, and video support for different VLMs in vLLM. 
""" +import math import os +from collections import defaultdict from pathlib import PosixPath from typing import Type @@ -10,11 +12,12 @@ from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform -from vllm.utils import cuda_device_count_stateless, identity +from vllm.utils import identity from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets, _VideoAssets) -from ....utils import fork_new_process_for_each_test, large_gpu_mark +from ....utils import (fork_new_process_for_each_test, large_gpu_mark, + multi_gpu_marks) from ...utils import check_outputs_equal from .vlm_utils import custom_inputs, model_utils, runners from .vlm_utils.case_filtering import get_parametrized_options @@ -382,7 +385,7 @@ prompt_path_encoder=model_utils.qwen_prompt_path_encoder, ), ### Tensor parallel / multi-gpu broadcast tests - "broadcast-chameleon": VLMTestInfo( + "chameleon-broadcast": VLMTestInfo( models=["facebook/chameleon-7b"], prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, @@ -393,43 +396,25 @@ vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], hf_output_post_proc = lambda hf_output, model: hf_output[:2], comparator=check_outputs_equal, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ), - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), - "broadcast-llava": VLMTestInfo( + "llava-broadcast": VLMTestInfo( models=["llava-hf/llava-1.5-7b-hf"], prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ) - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), - "broadcast-llava_next": VLMTestInfo( + "llava_next-broadcast": VLMTestInfo( models=["llava-hf/llava-v1.6-mistral-7b-hf"], prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", max_model_len=10240, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ) - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), ### Custom input edge-cases for specific models @@ -468,6 +453,41 @@ # yapf: enable +def _mark_splits( + test_settings: dict[str, VLMTestInfo], + *, + num_groups: int, +) -> dict[str, VLMTestInfo]: + name_by_test_info_id = {id(v): k for k, v in test_settings.items()} + test_infos_by_model = defaultdict[str, list[VLMTestInfo]](list) + + for info in test_settings.values(): + for model in info.models: + test_infos_by_model[model].append(info) + + models = sorted(test_infos_by_model.keys()) + split_size = math.ceil(len(models) / num_groups) + + new_test_settings = dict[str, VLMTestInfo]() + + for i in range(num_groups): + models_in_group = models[i * split_size:(i + 1) * split_size] + + for model in models_in_group: + for info in test_infos_by_model[model]: + new_marks = (info.marks or []) + [pytest.mark.split(group=i)] + new_info = info._replace(marks=new_marks) + new_test_settings[name_by_test_info_id[id(info)]] = new_info + + missing_keys 
= test_settings.keys() - new_test_settings.keys() + assert not missing_keys, f"Missing keys: {missing_keys}" + + return new_test_settings + + +VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) + + ### Test wrappers # Wrappers around the core test running func for: # - single image diff --git a/tests/utils.py b/tests/utils.py index a893667e144a6..afeb708f3bcdc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -682,10 +682,12 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: - """Gets a pytest skipif mark, which triggers ig the the device doesn't have - meet a minimum memory requirement in gb; can be leveraged via - @large_gpu_test to skip tests in environments without enough resources, or - called when filtering tests to run directly. + """ + Get a pytest mark, which skips the test if the GPU doesn't meet + a minimum memory requirement in GB. + + This can be leveraged via `@large_gpu_test` to skip tests in environments + without enough resources, or called when filtering tests to run directly. """ try: if current_platform.is_cpu(): @@ -712,26 +714,37 @@ def large_gpu_test(*, min_gb: int): Currently, the CI machine uses L4 GPU which has 24 GB VRAM. """ - test_skipif = large_gpu_mark(min_gb) + mark = large_gpu_mark(min_gb) def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - return test_skipif(f) + return mark(f) return wrapper -def multi_gpu_test(*, num_gpus: int): - """ - Decorate a test to be run only when multiple GPUs are available. - """ - test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus") +def multi_gpu_marks(*, num_gpus: int): + """Get a collection of pytest marks to apply for `@multi_gpu_test`.""" + test_selector = pytest.mark.distributed(num_gpus=num_gpus) test_skipif = pytest.mark.skipif( cuda_device_count_stateless() < num_gpus, reason=f"Need at least {num_gpus} GPUs to run the test.", ) + return [test_selector, test_skipif] + + +def multi_gpu_test(*, num_gpus: int): + """ + Decorate a test to be run only when multiple GPUs are available. + """ + marks = multi_gpu_marks(num_gpus=num_gpus) + def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - return test_selector(test_skipif(fork_new_process_for_each_test(f))) + func = fork_new_process_for_each_test(f) + for mark in reversed(marks): + func = mark(func) + + return func return wrapper From 452a723bf2e8410ee9b47f82f90c7ea48aa6d14f Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 11 Dec 2024 18:34:54 -0500 Subject: [PATCH 15/87] [V1][Core] Remove should_shutdown to simplify core process termination (#11113) Signed-off-by: Tyler Michael Smith --- vllm/v1/engine/core.py | 13 ++----------- vllm/v1/engine/core_client.py | 6 ------ 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 55a5c4dff3a5c..a26ffe74a3ae8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -5,7 +5,6 @@ import threading import time from multiprocessing.process import BaseProcess -from multiprocessing.sharedctypes import Synchronized from typing import List, Tuple, Type, Union import zmq @@ -133,13 +132,9 @@ def __init__( input_path: str, output_path: str, ready_path: str, - should_shutdown: Synchronized, ): super().__init__(vllm_config, executor_class, usage_context) - # Signal from main process to shutdown (multiprocessing.Value). - self.should_shutdown = should_shutdown - # Background Threads and Queues for IO. 
These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, # and to overlap some serialization/deserialization with the @@ -195,7 +190,6 @@ def make_engine_core_process( input_path: str, output_path: str, ready_path: str, - should_shutdown: Synchronized, ) -> BaseProcess: # The current process might have CUDA context, # so we need to spawn a new process. @@ -210,7 +204,6 @@ def make_engine_core_process( "vllm_config": vllm_config, "executor_class": executor_class, "usage_context": usage_context, - "should_shutdown": should_shutdown } # Run EngineCore busy loop in background process. proc = context.Process(target=EngineCoreProc.run_engine_core, @@ -260,8 +253,8 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - # Loop until we get a shutdown signal. - while not self.should_shutdown: + # Loop until process is sent a SIGINT or SIGTERM + while True: # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: @@ -272,8 +265,6 @@ def run_busy_loop(self): except queue.Empty: self._log_stats() logger.debug("EngineCore busy loop waiting.") - if self.should_shutdown: - return except BaseException: raise diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4d96b323d1662..1d5ddf4db4d7c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,4 @@ import atexit -import multiprocessing from typing import List, Union import msgspec @@ -149,21 +148,16 @@ def __init__( self.input_socket.bind(input_path) # Start EngineCore in background process. - self.should_shutdown = multiprocessing.Value('b', False, lock=False) self.proc = EngineCoreProc.make_engine_core_process( *args, input_path=input_path, output_path=output_path, ready_path=ready_path, - should_shutdown=self.should_shutdown, **kwargs, ) atexit.register(self.shutdown) def shutdown(self): - # Send shutdown signal to background process. - self.should_shutdown = True - # Shut down the zmq context. self.ctx.destroy(linger=0) From 4e116833686f3e0c0a223b05b5859ad76843a017 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Wed, 11 Dec 2024 19:55:30 -0500 Subject: [PATCH 16/87] [V1] VLM preprocessor hashing (#11020) Signed-off-by: Roger Wang Signed-off-by: Alexander Matveev Co-authored-by: Michael Goin Co-authored-by: Roger Wang --- examples/offline_inference_vision_language.py | 126 ++++++++++++-- requirements-common.txt | 1 + tests/v1/engine/test_engine_core.py | 1 + tests/v1/engine/test_engine_core_client.py | 1 + vllm/config.py | 10 +- vllm/engine/arg_utils.py | 8 + vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/core.py | 18 +- vllm/v1/engine/mm_input_mapper.py | 156 ++++++++++++++++-- vllm/v1/engine/processor.py | 35 ++-- vllm/v1/utils.py | 21 +++ 11 files changed, 332 insertions(+), 48 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index c6a274ee5894b..5e210126dc8fe 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -5,6 +5,8 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
""" +import random + from transformers import AutoTokenizer from vllm import LLM, SamplingParams @@ -23,7 +25,9 @@ def run_llava(question: str, modality: str): prompt = f"USER: \n{question}\nASSISTANT:" - llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096) + llm = LLM(model="llava-hf/llava-1.5-7b-hf", + max_model_len=4096, + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids @@ -33,7 +37,9 @@ def run_llava_next(question: str, modality: str): assert modality == "image" prompt = f"[INST] \n{question} [/INST]" - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192) + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", + max_model_len=8192, + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids @@ -44,7 +50,9 @@ def run_llava_next_video(question: str, modality: str): assert modality == "video" prompt = f"USER: