[Doc] Update docs to refer to pooling models #11093

Merged: 1 commit on Dec 11, 2024
7 changes: 6 additions & 1 deletion docs/source/usage/faq.rst
@@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul

Q: Which model to use for offline inference embedding?

- A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
+ A: You can try `e5-mistral-7b-instruct <https://huggingface.co/intfloat/e5-mistral-7b-instruct>`__ and `BAAI/bge-base-en-v1.5 <https://huggingface.co/BAAI/bge-base-en-v1.5>`__;
+ more are listed :ref:`here <supported_models>`.
+
+ By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__,
+ `Mistral-7B-Instruct-v0.3 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`__ into embedding models,
+ but they are expected to be inferior to models that are specifically trained on embedding tasks.

----------------------------------------
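A minimal offline sketch of the recommendation above, assuming vLLM's `LLM.encode` API; the `task` value ("embed" vs. "embedding") and the output attribute names are assumptions that may differ by vLLM version:

```python
# Minimal offline-embedding sketch with a pooling model.
# Assumes the LLM.encode API; the task name and output attribute
# names may differ between vLLM versions.
from vllm import LLM

llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
outputs = llm.encode(["What is the capital of France?"])
for output in outputs:
    print(len(output.outputs.embedding))  # dimensionality of the pooled embedding
```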
2 changes: 1 addition & 1 deletion vllm/attention/backends/placeholder_attn.py
@@ -14,7 +14,7 @@
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
ModelInputForGPUWithSamplingMetadata)

- # Placeholder attention backend for models like Mamba and embedding models that
+ # Placeholder attention backend for models like Mamba and pooling models that
# lack attention.
8 changes: 4 additions & 4 deletions vllm/config.py
@@ -152,7 +152,7 @@ class ModelConfig:
this argument will be used to configure the neuron config that
can not be gathered from the vllm arguments.
override_pooler_config: Initialize non default pooling config or
- override default pooling config for the embedding model.
+ override default pooling config for the pooling model.
"""

def __init__(
@@ -576,7 +576,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
self.use_async_output_proc = False
return

- # Async postprocessor is not necessary with embedding mode
+ # Async postprocessor is not necessary for pooling models
# since there is no token generation
if self.runner_type == "pooling":
self.use_async_output_proc = False
@@ -1825,11 +1825,11 @@ class MultiModalConfig:

@dataclass
class PoolerConfig:
- """Controls the behavior of output pooling in embedding models."""
+ """Controls the behavior of output pooling in pooling models."""

pooling_type: Optional[str] = None
"""
- The pooling method of the embedding model. This should be a key in
+ The pooling method of the pooling model. This should be a key in
:class:`vllm.model_executor.layers.pooler.PoolingType`.
"""
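To make the `override_pooler_config` / `PoolerConfig` relationship concrete, a hypothetical offline override; passing `override_pooler_config` through the `LLM` constructor and the `"MEAN"` pooling type are assumptions, not a verified example:

```python
# Hypothetical sketch: overriding the default pooler of a pooling model.
# The override_pooler_config pass-through on LLM and the "MEAN" pooling
# type are assumptions; check your vLLM version.
from vllm import LLM
from vllm.config import PoolerConfig

llm = LLM(
    model="BAAI/bge-base-en-v1.5",
    task="embed",
    override_pooler_config=PoolerConfig(pooling_type="MEAN", normalize=True),
)
outputs = llm.encode(["Pooling behavior can be overridden per model."])
```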
2 changes: 1 addition & 1 deletion vllm/core/placeholder_block_space_manager.py
@@ -8,7 +8,7 @@
class PlaceholderBlockSpaceManager(BlockSpaceManager):
"""A version of BlockSpaceManager for use in environments
where block management is not required.
- For example: embedding models or attention-free models like Mamba.
+ For example: pooling models or attention-free models like Mamba.

This class provides the same interface as BlockSpaceManager, but its
methods perform no actions or return simple values like True in specific
4 changes: 2 additions & 2 deletions vllm/engine/arg_utils.py
@@ -893,7 +893,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
'--override-pooler-config',
type=PoolerConfig.from_json,
default=None,
- help="Override or set the pooling method in the embedding model. "
+ help="Override or set the pooling method for pooling models. "
"e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")

parser.add_argument('--compilation-config',
@@ -1085,7 +1085,7 @@ def create_engine_config(self,
"setting --max-model-len to a smaller value.", max_model_len)
elif (self.enable_chunked_prefill
and model_config.runner_type == "pooling"):
- msg = "Chunked prefill is not supported for embedding models"
+ msg = "Chunked prefill is not supported for pooling models"
raise ValueError(msg)
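As a sketch of how the flag's JSON value is consumed, mirroring the example in the help string; assuming `PoolerConfig.from_json` (the argparse `type=` hook above) simply builds a `PoolerConfig` from the parsed fields:

```python
# Sketch of how the --override-pooler-config JSON is consumed: the
# PoolerConfig.from_json hook builds a PoolerConfig from the parsed fields.
# Exact accepted keys/values beyond this example are an assumption.
from vllm.config import PoolerConfig

config = PoolerConfig.from_json('{"pooling_type": "mean", "normalize": false}')
print(config.pooling_type, config.normalize)  # mean False
```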
2 changes: 1 addition & 1 deletion vllm/engine/async_llm_engine.py
@@ -1085,7 +1085,7 @@ async def encode(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
- """Generate outputs for a request from an embedding model.
+ """Generate outputs for a request from a pooling model.

Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs
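A hedged sketch of driving this coroutine end to end; the engine construction, the `task` name, and the positional argument order are assumptions:

```python
# Hypothetical sketch of streaming pooling output from AsyncLLMEngine.encode.
# Engine construction, the task name, and the argument order are assumptions.
import asyncio

from vllm import AsyncEngineArgs, AsyncLLMEngine, PoolingParams


async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="intfloat/e5-mistral-7b-instruct", task="embed"))
    final_output = None
    async for output in engine.encode("Hello, world!",
                                      PoolingParams(),
                                      request_id="embed-0"):
        final_output = output  # each yield is the latest PoolingRequestOutput
    print(final_output)


asyncio.run(main())
```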
2 changes: 1 addition & 1 deletion vllm/engine/multiprocessing/client.py
@@ -527,7 +527,7 @@ def encode(
*,
inputs: Optional[PromptType] = None  # DEPRECATED
) -> AsyncGenerator[PoolingRequestOutput, None]:
- """Generate outputs for a request from an embedding model.
+ """Generate outputs for a request from a pooling model.

Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs
2 changes: 1 addition & 1 deletion vllm/engine/protocol.py
@@ -209,7 +209,7 @@ def encode(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
- """Generate outputs for a request from an embedding model."""
+ """Generate outputs for a request from a pooling model."""
...

@abstractmethod
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/serving_score.py
@@ -119,7 +119,7 @@ async def create_score(

if prompt_adapter_request is not None:
raise NotImplementedError("Prompt adapter is not supported "
- "for embedding models")
+ "for scoring models")

if isinstance(tokenizer, MistralTokenizer):
raise ValueError(
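For context on the route `create_score` backs, a hypothetical client request; the endpoint path, payload fields, and reranker model are assumptions that depend on the server version and setup:

```python
# Hypothetical client call against the server's score route; the endpoint
# path, request fields, and reranker model are assumptions.
import requests

response = requests.post(
    "http://localhost:8000/score",
    json={
        "model": "BAAI/bge-reranker-v2-m3",
        "text_1": "What is the capital of France?",
        "text_2": "Paris is the capital of France.",
    },
)
print(response.json())  # expected to contain a relevance score per text pair
```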
6 changes: 3 additions & 3 deletions vllm/sequence.py
@@ -618,9 +618,9 @@ class SequenceGroup:
arrival_time: The arrival time of the request.
lora_request: LoRA request.
embeddings: The embeddings vectors of the prompt of the sequence group
- for an embedding model.
+ for a pooling model.
pooling_params: The pooling parameters used to generate the pooling
- for an embedding model.
+ for a pooling model.
encoder_seq: Optional, the single encoder sequence. Should be None
unless you are working with an encoder/decoder model.
trace_headers: OpenTelemetry trace headers.
@@ -1102,7 +1102,7 @@ class PoolerOutput(
msgspec.Struct,
omit_defaults=True,  # type: ignore[call-arg]
array_like=True):  # type: ignore[call-arg]
- """The output from a pooling operation in the embedding model."""
+ """The output from a pooling operation in the pooling model."""
outputs: List[EmbeddingSequenceGroupOutput]

# lazy import to avoid circular import
2 changes: 1 addition & 1 deletion vllm/v1/engine/processor.py
@@ -59,7 +59,7 @@ def process_inputs(
priority: int = 0,
) -> Tuple[DetokenizerRequest, EngineCoreRequest]:

- # TODO(woosuk): Support embedding mode.
+ # TODO(woosuk): Support pooling models.
# TODO(woosuk): Check max_logprobs
# TODO(woosuk): Support encoder-decoder models.
2 changes: 1 addition & 1 deletion vllm/worker/cpu_worker.py
@@ -178,7 +178,7 @@ def __init__(
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CPUCacheEngine]
- # Initialize cpu_cache as embedding models don't initialize kv_caches
+ # Initialize cpu_cache as pooling models don't initialize kv_caches
self.cpu_cache: Optional[List[List[torch.Tensor]]] = None

# Torch profiler. Enabled and configured through env vars:
4 changes: 2 additions & 2 deletions vllm/worker/hpu_worker.py
@@ -65,8 +65,8 @@ def __init__(
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[HPUCacheEngine]
- # Initialize gpu_cache as embedding models don't initialize kv_caches
- self.hpu_cache: Optional[List[List[torch.tensor]]] = None
+ # Initialize gpu_cache as pooling models don't initialize kv_caches
+ self.hpu_cache: Optional[List[List[torch.Tensor]]] = None
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR:
2 changes: 1 addition & 1 deletion vllm/worker/worker.py
@@ -91,7 +91,7 @@ def __init__(
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CacheEngine]
- # Initialize gpu_cache as embedding models don't initialize kv_caches
+ # Initialize gpu_cache as pooling models don't initialize kv_caches
self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}