From 5950f555a1d2ce19c30efb24abe03737320d05c1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 8 Jan 2025 01:20:12 +0000 Subject: [PATCH 01/55] [Doc] Group examples into categories (#11782) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .gitignore | 5 +- docs/Makefile | 4 + docs/requirements-docs.txt | 1 + docs/source/conf.py | 4 + docs/source/generate_examples.py | 264 +++++++++++++++--- .../examples/examples_index.template.md | 8 - examples/fp8/README.md | 6 +- .../Otel.md | 0 .../dummy_client.py | 0 .../README.md | 10 +- .../docker-compose.yaml | 0 .../grafana.json | 0 .../prometheus.yaml | 0 13 files changed, 240 insertions(+), 62 deletions(-) delete mode 100644 docs/source/getting_started/examples/examples_index.template.md rename examples/{production_monitoring => opentelemetry}/Otel.md (100%) rename examples/{production_monitoring => opentelemetry}/dummy_client.py (100%) rename examples/{production_monitoring => prometheus_grafana}/README.md (95%) rename examples/{production_monitoring => prometheus_grafana}/docker-compose.yaml (100%) rename examples/{production_monitoring => prometheus_grafana}/grafana.json (100%) rename examples/{production_monitoring => prometheus_grafana}/prometheus.yaml (100%) diff --git a/.gitignore b/.gitignore index bb7e4d5b244a8..89dab8f13bab1 100644 --- a/.gitignore +++ b/.gitignore @@ -79,10 +79,7 @@ instance/ # Sphinx documentation docs/_build/ -docs/source/getting_started/examples/*.rst -!**/*.template.rst -docs/source/getting_started/examples/*.md -!**/*.template.md +docs/source/getting_started/examples/ # PyBuilder .pybuilder/ diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf1020d5..5b801f79d1f26 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -18,3 +18,7 @@ help: # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean: + @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + rm -rf "$(SOURCEDIR)/getting_started/examples" diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 25a700033cc9e..64cf6ef8fc19d 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -3,6 +3,7 @@ sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 myst-parser==3.0.1 sphinx-argparse==0.4.0 +sphinx-togglebutton==0.3.2 msgspec cloudpickle diff --git a/docs/source/conf.py b/docs/source/conf.py index 71394c5302a39..1ce11fe057071 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,6 +43,10 @@ "sphinx.ext.autosummary", "myst_parser", "sphinxarg.ext", + "sphinx_togglebutton", +] +myst_enable_extensions = [ + "colon_fence", ] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index aef32f7559f74..32bb86c469c78 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -1,54 +1,234 @@ +import itertools import re +from dataclasses import dataclass, field from pathlib import Path +ROOT_DIR = Path(__file__).parent.parent.parent.resolve() +ROOT_DIR_RELATIVE = '../../../..' 
+EXAMPLE_DIR = ROOT_DIR / "examples" +EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples" + def fix_case(text: str) -> str: - subs = [ - ("api", "API"), - ("llm", "LLM"), - ("vllm", "vLLM"), - ("openai", "OpenAI"), - ("multilora", "MultiLoRA"), - ] - for sub in subs: - text = re.sub(*sub, text, flags=re.IGNORECASE) + subs = { + "api": "API", + "cpu": "CPU", + "llm": "LLM", + "tpu": "TPU", + "aqlm": "AQLM", + "gguf": "GGUF", + "lora": "LoRA", + "vllm": "vLLM", + "openai": "OpenAI", + "multilora": "MultiLoRA", + "mlpspeculator": "MLPSpeculator", + r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 + r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16 + } + for pattern, repl in subs.items(): + text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE) return text -def generate_title(filename: str) -> str: - # Turn filename into a title - title = filename.replace("_", " ").title() - # Handle acronyms and names - title = fix_case(title) - return f"# {title}" +@dataclass +class Index: + """ + Index class to generate a structured document index. + + Attributes: + path (Path): The path save the index file to. + title (str): The title of the index. + description (str): A brief description of the index. + caption (str): An optional caption for the table of contents. + maxdepth (int): The maximum depth of the table of contents. Defaults to 1. + documents (list[str]): A list of document paths to include in the index. Defaults to an empty list. + + Methods: + generate() -> str: + Generates the index content as a string in the specified format. + """ # noqa: E501 + path: Path + title: str + description: str + caption: str + maxdepth: int = 1 + documents: list[str] = field(default_factory=list) + + def generate(self) -> str: + content = f"# {self.title}\n\n{self.description}\n\n" + content += "```{toctree}\n" + content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" + content += "\n".join(sorted(self.documents)) + "\n```\n" + return content + + +@dataclass +class Example: + """ + Example class for generating documentation content from a given path. + + Attributes: + path (Path): The path to the main directory or file. + category (str): The category of the document. + main_file (Path): The main file in the directory. + other_files (list[Path]): List of other files in the directory. + title (str): The title of the document. + + Methods: + __post_init__(): Initializes the main_file, other_files, and title attributes. + determine_main_file() -> Path: Determines the main file in the given path. + determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. + determine_title() -> str: Determines the title of the document. + generate() -> str: Generates the documentation content. + """ # noqa: E501 + path: Path + category: str = None + main_file: Path = field(init=False) + other_files: list[Path] = field(init=False) + title: str = field(init=False) + + def __post_init__(self): + self.main_file = self.determine_main_file() + self.other_files = self.determine_other_files() + self.title = self.determine_title() + + def determine_main_file(self) -> Path: + """ + Determines the main file in the given path. + If the path is a file, it returns the path itself. Otherwise, it searches + for Markdown files (*.md) in the directory and returns the first one found. + Returns: + Path: The main file path, either the original path if it's a file or the first + Markdown file found in the directory. 
+ Raises: + IndexError: If no Markdown files are found in the directory. + """ # noqa: E501 + return self.path if self.path.is_file() else list( + self.path.glob("*.md")).pop() + + def determine_other_files(self) -> list[Path]: + """ + Determine other files in the directory excluding the main file. + + This method checks if the given path is a file. If it is, it returns an empty list. + Otherwise, it recursively searches through the directory and returns a list of all + files that are not the main file. + + Returns: + list[Path]: A list of Path objects representing the other files in the directory. + """ # noqa: E501 + if self.path.is_file(): + return [] + is_other_file = lambda file: file.is_file() and file != self.main_file + return [file for file in self.path.rglob("*") if is_other_file(file)] + + def determine_title(self) -> str: + return fix_case(self.path.stem.replace("_", " ").title()) + + def generate(self) -> str: + # Convert the path to a relative path from __file__ + make_relative = lambda path: ROOT_DIR_RELATIVE / path.relative_to( + ROOT_DIR) + + content = f"Source .\n\n" + if self.main_file.suffix == ".py": + content += f"# {self.title}\n\n" + include = "include" if self.main_file.suffix == ".md" else \ + "literalinclude" + content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n" + + if not self.other_files: + return content + + content += "## Example materials\n\n" + for file in self.other_files: + include = "include" if file.suffix == ".md" else "literalinclude" + content += f":::{{admonition}} {file.relative_to(self.path)}\n" + content += ":class: dropdown\n\n" + content += f":::{{{include}}} {make_relative(file)}\n:::\n" + content += ":::\n\n" + + return content def generate_examples(): - root_dir = Path(__file__).parent.parent.parent.resolve() - - # Source paths - script_dir = root_dir / "examples" - script_paths = sorted(script_dir.glob("*.py")) - - # Destination paths - doc_dir = root_dir / "docs/source/getting_started/examples" - doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths] - - # Generate the example docs for each example script - for script_path, doc_path in zip(script_paths, doc_paths): - # Make script_path relative to doc_path and call it include_path - include_path = '../../../..' 
/ script_path.relative_to(root_dir) - content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source: .\n\n" - f"```{{literalinclude}} {include_path}\n" - ":language: python\n" - ":linenos:\n```") + # Create the EXAMPLE_DOC_DIR if it doesn't exist + if not EXAMPLE_DOC_DIR.exists(): + EXAMPLE_DOC_DIR.mkdir(parents=True) + + # Create empty indices + examples_index = Index( + path=EXAMPLE_DOC_DIR / "examples_index.md", + title="Examples", + description= + "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using from examples found in .", # noqa: E501 + caption="Examples", + maxdepth=1) # TODO change to 2 when examples start being categorised + category_indices = { + "offline_inference": + Index( + path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", + title="Offline Inference", + description= + "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 + caption="Examples", + ), + "online_serving": + Index( + path=EXAMPLE_DOC_DIR / "examples_online_serving_index.md", + title="Online Serving", + description= + "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501 + caption="Examples", + ), + "other": + Index( + path=EXAMPLE_DOC_DIR / "examples_other_index.md", + title="Other", + description= + "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 + caption="Examples", + ), + } + + examples = [] + # Find categorised examples + for category in category_indices: + category_dir = EXAMPLE_DIR / category + py = category_dir.glob("*.py") + md = category_dir.glob("*.md") + for path in itertools.chain(py, md): + examples.append(Example(path, category)) + # Find examples in subdirectories + for path in category_dir.glob("*/*.md"): + examples.append(Example(path.parent, category)) + # Find uncategorised examples + py = EXAMPLE_DIR.glob("*.py") + md = EXAMPLE_DIR.glob("*.md") + for path in itertools.chain(py, md): + examples.append(Example(path)) + # Find examples in subdirectories + for path in EXAMPLE_DIR.glob("*/*.md"): + # Skip categorised examples + if path.parent.name in category_indices: + continue + examples.append(Example(path.parent)) + + # Generate the example documentation + for example in examples: + doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md" with open(doc_path, "w+") as f: - f.write(content) - - # Generate the toctree for the example scripts - with open(doc_dir / "examples_index.template.md") as f: - examples_index = f.read() - with open(doc_dir / "examples_index.md", "w+") as f: - example_docs = "\n".join(path.stem + ".md" for path in script_paths) - f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) + f.write(example.generate()) + # Add the example to the appropriate index + index = category_indices.get(example.category, examples_index) + index.documents.append(example.path.stem) + + # Generate the index files + for category_index in category_indices.values(): + if category_index.documents: + examples_index.documents.insert(0, category_index.path.name) + with open(category_index.path, "w+") as f: + f.write(category_index.generate()) + + with open(examples_index.path, "w+") as f: + f.write(examples_index.generate()) diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md deleted file mode 100644 
index de7a91c0ffa48..0000000000000 --- a/docs/source/getting_started/examples/examples_index.template.md +++ /dev/null @@ -1,8 +0,0 @@ -# Examples - -```{toctree} -:maxdepth: 1 -:caption: Scripts - -%EXAMPLE_DOCS% -``` \ No newline at end of file diff --git a/examples/fp8/README.md b/examples/fp8/README.md index 181c36558fcff..5492872cae93a 100644 --- a/examples/fp8/README.md +++ b/examples/fp8/README.md @@ -56,7 +56,7 @@ python3 examples/fp8/extract_scales.py --quantized_model - ``` ### 4. Load KV Cache Scaling Factors into VLLM. This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8. -```python +``` # prerequisites: # - LLaMa 2 kv_cache_scales.json file @@ -90,7 +90,7 @@ optional arguments: --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported ```for common inference criteria. --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. ``` -``` Example: +```console python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --quantization-param-path --model -```python +``` diff --git a/examples/production_monitoring/Otel.md b/examples/opentelemetry/Otel.md similarity index 100% rename from examples/production_monitoring/Otel.md rename to examples/opentelemetry/Otel.md diff --git a/examples/production_monitoring/dummy_client.py b/examples/opentelemetry/dummy_client.py similarity index 100% rename from examples/production_monitoring/dummy_client.py rename to examples/opentelemetry/dummy_client.py diff --git a/examples/production_monitoring/README.md b/examples/prometheus_grafana/README.md similarity index 95% rename from examples/production_monitoring/README.md rename to examples/prometheus_grafana/README.md index 807c0470e7b30..c49e5306a1cb4 100644 --- a/examples/production_monitoring/README.md +++ b/examples/prometheus_grafana/README.md @@ -1,4 +1,4 @@ -# vLLM + Prometheus/Grafana +# Prometheus and Grafana This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. @@ -6,7 +6,7 @@ Install: - [`docker`](https://docs.docker.com/engine/install/) - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository) -### Launch +## Launch Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint: ```bash @@ -35,11 +35,11 @@ python3 ../../benchmarks/benchmark_serving.py \ Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM. 
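As a quick sanity check (an illustrative snippet, not part of this patch; it assumes the server is running on the default port and that these metric names are unchanged), you can filter a couple of the scheduler gauges directly from the endpoint:

```bash
# Print the running/waiting request gauges that Prometheus will scrape
curl -s http://localhost:8000/metrics | grep -E "^vllm:num_requests_(running|waiting)"
```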
-### Grafana Dashboard +## Grafana Dashboard Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`). -#### Add Prometheus Data Source +### Add Prometheus Data Source Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. @@ -47,7 +47,7 @@ On Prometheus configuration page, we need to add the `Prometheus Server URL` in Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". -#### Import Dashboard +### Import Dashboard Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following: diff --git a/examples/production_monitoring/docker-compose.yaml b/examples/prometheus_grafana/docker-compose.yaml similarity index 100% rename from examples/production_monitoring/docker-compose.yaml rename to examples/prometheus_grafana/docker-compose.yaml diff --git a/examples/production_monitoring/grafana.json b/examples/prometheus_grafana/grafana.json similarity index 100% rename from examples/production_monitoring/grafana.json rename to examples/prometheus_grafana/grafana.json diff --git a/examples/production_monitoring/prometheus.yaml b/examples/prometheus_grafana/prometheus.yaml similarity index 100% rename from examples/production_monitoring/prometheus.yaml rename to examples/prometheus_grafana/prometheus.yaml From 91445c7bc8000a6f6f1efed0882076d7001be968 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 8 Jan 2025 10:17:16 +0800 Subject: [PATCH 02/55] [Bugfix] Fix image input for Pixtral-HF (#11741) Signed-off-by: DarkLight1337 --- ...e_inference_vision_language_multi_image.py | 41 ++++++++++++++++--- vllm/model_executor/models/llava.py | 6 +++ vllm/model_executor/models/pixtral.py | 2 +- vllm/model_executor/models/utils.py | 9 ++++ 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 6af8d7768e75d..cf2e90a325c6a 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -23,7 +23,7 @@ class ModelRequestData(NamedTuple): llm: LLM prompt: str - stop_token_ids: Optional[List[str]] + stop_token_ids: Optional[List[int]] image_data: List[Image] chat_template: Optional[str] @@ -44,12 +44,14 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData: prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" "<|im_start|>assistant\n") stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return ModelRequestData( llm=llm, prompt=prompt, stop_token_ids=stop_token_ids, image_data=[fetch_image(url) for url in image_urls], - chat_template=None) + chat_template=None, + ) def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: @@ -166,7 +168,8 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, ) - prompt = f"<|image|><|image|><|begin_of_text|>{question}" + placeholders = "<|image|>" * len(image_urls) + prompt = f"{placeholders}<|begin_of_text|>{question}" return ModelRequestData( llm=llm, prompt=prompt, @@ -209,6 +212,31 @@ def load_nvlm_d(question: str, image_urls: List[str]): ) +def load_pixtral_hf(question: str, image_urls: List[str]) -> 
ModelRequestData: + model_name = "mistral-community/pixtral-12b" + + # Adjust this as necessary to fit in GPU + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + tensor_parallel_size=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "[IMG]" * len(image_urls) + prompt = f"[INST]{question}\n{placeholders}[/INST]" + stop_token_ids = None + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: # num_crops is an override kwarg to the multimodal image processor; # For some models, e.g., Phi-3.5-vision-instruct, it is recommended @@ -244,7 +272,8 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: +def load_qwen_vl_chat(question: str, + image_urls: List[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" llm = LLM( model=model_name, @@ -274,6 +303,7 @@ def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + return ModelRequestData( llm=llm, prompt=prompt, @@ -348,7 +378,8 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: "mllama": load_mllama, "NVLM_D": load_nvlm_d, "phi3_v": load_phi3v, - "qwen_vl_chat": load_qwenvl_chat, + "pixtral_hf": load_pixtral_hf, + "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, } diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 4299af8cd03a2..305f1364dba23 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -546,6 +546,12 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") + if self.config.vision_config.model_type == "pixtral": + return LlavaImagePixelInputs( + type="pixel_values", + data=flatten_bn(pixel_values), + ) + return LlavaImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 9e1d38512c0b4..b74bb3c8a3f88 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -774,7 +774,7 @@ def get_num_image_tokens( ) -> int: return get_pixtral_hf_image_feature_size( image_size=self.vision_config.image_size, - patch_size=self.get_image_size(), + patch_size=self.vision_config.patch_size, ) def get_max_image_tokens(self) -> int: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 31017f16d3c97..4ed3b237ae0e2 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -281,6 +281,15 @@ def flatten_bn( ... +@overload +def flatten_bn( + x: Union[List[torch.Tensor], torch.Tensor], + *, + concat: bool = False, +) -> Union[List[torch.Tensor], torch.Tensor]: + ... 
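# Illustrative note (not part of the original change): the overload above lets a
# call site pass `concat` as a plain runtime bool and still type-check, e.g.
#   data = flatten_bn(pixel_values, concat=do_concat)
# where `pixel_values: List[torch.Tensor]` and `do_concat: bool` are hypothetical
# names; such a call resolves to this overload and is typed as the returned Union.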
+ + def flatten_bn( x: Union[List[torch.Tensor], torch.Tensor], *, From 4d29e91be84d27ca313d657eee92c067439a4c23 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Tue, 7 Jan 2025 20:57:04 -0600 Subject: [PATCH 03/55] [Misc] sort torch profiler table by kernel timing (#11813) --- benchmarks/benchmark_latency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 0a14aedd5feba..e669ce4db299d 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -52,7 +52,7 @@ def run_to_completion(profile_dir: Optional[str] = None): llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) - print(p.key_averages()) + print(p.key_averages().table(sort_by="self_cuda_time_total")) else: start_time = time.perf_counter() llm.generate(dummy_prompts, From dc71af0a71f347badcd917810440fad136e73ba6 Mon Sep 17 00:00:00 2001 From: WangErXiao <863579016@qq.com> Date: Wed, 8 Jan 2025 12:09:25 +0800 Subject: [PATCH 04/55] =?UTF-8?q?Remove=20the=20duplicate=20imports=20of?= =?UTF-8?q?=20MultiModalKwargs=20and=20PlaceholderRange=E2=80=A6=20(#11824?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm/v1/core/scheduler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index baaf3329dc79f..b26716f5c02e6 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -5,8 +5,6 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger -from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.base import PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager From b640b19cc0babe256c5455befe95340f951763d9 Mon Sep 17 00:00:00 2001 From: Nishidha Date: Wed, 8 Jan 2025 10:35:37 +0530 Subject: [PATCH 05/55] Fixed docker build for ppc64le (#11518) Signed-off-by: Nishidha Panpaliya --- Dockerfile.ppc64le | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index 971248577983f..d3cd1c7b313bc 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -4,7 +4,7 @@ USER root ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" -RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 +RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev # Some packages in requirements-cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba @@ -18,9 +18,8 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi -# These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ + RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ From f4923cb8bce7d9c3038ad6c597ae1ff3ed90fe93 Mon Sep 17 00:00:00 
2001 From: Ilya Lavrenov Date: Wed, 8 Jan 2025 09:08:30 +0400 Subject: [PATCH 06/55] [OpenVINO] Fixed Docker.openvino build (#11732) Signed-off-by: Ilya Lavrenov --- Dockerfile.openvino | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.openvino b/Dockerfile.openvino index 8bd188ffde408..32bcbfa9cc168 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -14,6 +14,7 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi +RUN python3 -m pip install -U pip # install build requirements RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt # build vLLM with OpenVINO backend From f645eb69545672d394e9e9e0ce46c725504fd2a0 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 8 Jan 2025 13:08:48 +0800 Subject: [PATCH 07/55] [Bugfix] Add checks for LoRA and CPU offload (#11810) Signed-off-by: Jee Jee Li --- vllm/config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 8b824a1fca511..a9b6d6b19127f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2051,6 +2051,11 @@ def __post_init__(self): f"max_cpu_loras ({self.max_cpu_loras}) must be >= " f"max_loras ({self.max_loras})") + def verify_with_cache_config(self, cache_config: CacheConfig): + # TODO LoRA supports CPU offload. + if cache_config.cpu_offload_gb > 0: + raise ValueError("CPU offload is not supported with LoRA yet.") + def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): self.lora_dtype = model_config.dtype @@ -3138,6 +3143,7 @@ def __post_init__(self): self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config: + self.lora_config.verify_with_cache_config(self.cache_config) self.lora_config.verify_with_model_config(self.model_config) self.lora_config.verify_with_scheduler_config( self.scheduler_config) From 259abd8953a8fea9abf3c4e66aa7c51391fa5b64 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 7 Jan 2025 21:16:08 -0800 Subject: [PATCH 08/55] [Docs] reorganize sponsorship page (#11639) Signed-off-by: simon-mo --- README.md | 15 ++++++++++----- docs/source/community/sponsors.md | 14 ++++++++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 652268ec29cac..8e85b460363fc 100644 --- a/README.md +++ b/README.md @@ -90,28 +90,33 @@ vLLM is a community project. Our compute resources for development and testing a - +Cash Donations: - a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: - AMD - Anyscale - AWS - Crusoe Cloud - Databricks - DeepInfra -- Dropbox - Google Cloud - Lambda Lab - Nebius +- Novita - NVIDIA - Replicate - Roblox - RunPod -- Sequoia Capital -- Skywork AI - Trainy - UC Berkeley - UC San Diego -- ZhenFund + +Slack Sponsor: Anyscale We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index c6f83b3a92ca0..3d5a57baefbde 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -5,26 +5,32 @@ vLLM is a community project. 
Our compute resources for development and testing a +Cash Donations: - a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: - AMD - Anyscale - AWS - Crusoe Cloud - Databricks - DeepInfra -- Dropbox - Google Cloud - Lambda Lab - Nebius +- Novita - NVIDIA - Replicate - Roblox - RunPod -- Sequoia Capital -- Skywork AI - Trainy - UC Berkeley - UC San Diego -- ZhenFund + +Slack Sponsor: Anyscale We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. From ef68eb28d8d45be6e0defe82245e16be9362e375 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 8 Jan 2025 13:40:09 +0800 Subject: [PATCH 09/55] [Bug] Fix pickling of `ModelConfig` when RunAI Model Streamer is used (#11825) Signed-off-by: DarkLight1337 --- vllm/config.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index a9b6d6b19127f..44426489f686a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -381,16 +381,16 @@ def maybe_pull_model_tokenizer_for_s3(self, model: str, """ if is_s3(model) or is_s3(tokenizer): if is_s3(model): - self.s3_model = S3Model() - self.s3_model.pull_files(model, allow_pattern=["*config.json"]) + s3_model = S3Model() + s3_model.pull_files(model, allow_pattern=["*config.json"]) self.model_weights = self.model - self.model = self.s3_model.dir + self.model = s3_model.dir if is_s3(tokenizer): - self.s3_tokenizer = S3Model() - self.s3_tokenizer.pull_files( + s3_tokenizer = S3Model() + s3_tokenizer.pull_files( model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"]) - self.tokenizer = self.s3_tokenizer.dir + self.tokenizer = s3_tokenizer.dir def _init_multimodal_config( self, limit_mm_per_prompt: Optional[Mapping[str, int]] From 889e662eae19fe8f30469883c6854ee4df4315a9 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 8 Jan 2025 14:36:03 +0800 Subject: [PATCH 10/55] [misc] improve memory profiling (#11809) Signed-off-by: youkaichao Co-authored-by: Cyrus Leung --- tests/test_utils.py | 19 +++++- .../vllm_test_utils/__init__.py | 3 +- .../vllm_test_utils/monitor.py | 68 +++++++++++++++++++ vllm/utils.py | 12 ++-- 4 files changed, 94 insertions(+), 8 deletions(-) create mode 100644 tests/vllm_test_utils/vllm_test_utils/monitor.py diff --git a/tests/test_utils.py b/tests/test_utils.py index 32a6b0aed66aa..0285b00d73be1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,6 +5,7 @@ import pytest import torch +from vllm_test_utils import monitor from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs, get_open_port, memory_profiling, merge_async_iterators, @@ -289,8 +290,16 @@ def test_memory_profiling(): weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB + def measure_current_non_torch(): + free, total = torch.cuda.mem_get_info() + current_used = total - free + current_torch = torch.cuda.memory_reserved() + current_non_torch = current_used - current_torch + return current_non_torch + with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes, - weights_memory_in_bytes=weights_memory_in_bytes) as result: + weights_memory_in_bytes=weights_memory_in_bytes) as result, \ + monitor(measure_current_non_torch) as monitored_values: # make a memory spike, 1 GiB spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32) del spike @@ -298,7 +307,15 @@ def test_memory_profiling(): # Add some extra non-torch memory 256 MiB (simulate NCCL) 
handle2 = lib.cudaMalloc(256 * 1024 * 1024) + # this is an analytic value, it is exact, + # we only have 256 MiB non-torch memory increase + measured_diff = monitored_values.values[-1] - monitored_values.values[0] + assert measured_diff == 256 * 1024 * 1024 + # Check that the memory usage is within 5% of the expected values + # 5% tolerance is caused by PyTorch caching allocator, + # we cannot control PyTorch's behavior of its internal buffers, + # which causes a small error (<10 MiB in practice) non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa assert abs(non_torch_ratio - 1) <= 0.05 diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py index bf0b62a5b75e3..6505c81546bb0 100644 --- a/tests/vllm_test_utils/vllm_test_utils/__init__.py +++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py @@ -4,5 +4,6 @@ """ from .blame import BlameResult, blame +from .monitor import MonitoredValues, monitor -__all__ = ["blame", "BlameResult"] +__all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"] diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py new file mode 100644 index 0000000000000..a237f53a75d18 --- /dev/null +++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py @@ -0,0 +1,68 @@ +import contextlib +import dataclasses +import sys +import traceback +from typing import Callable, Generator, Generic, TypeVar + +_T = TypeVar("_T") + + +@dataclasses.dataclass +class MonitoredValues(Generic[_T]): + values: list[_T] = dataclasses.field(default_factory=list) + trace_stacks: list[str] = dataclasses.field(default_factory=list) + + +@contextlib.contextmanager +def monitor( + measure_func: Callable[[], + _T]) -> Generator[MonitoredValues[_T], None, None]: + """ + Trace the function calls to continuously monitor the change of + a value. + + Usage: + + ```python + + def measure_func(): + ... # measure the current value + return current_value + + with monitor(measure_func) as monitored_values: + # do something + + monitored_values.values # all changes of the values + monitored_values.trace_stacks # trace stacks of every change + ``` + """ + monitored_values = MonitoredValues[_T]() + + def _trace_calls(frame, event, arg=None): + nonlocal monitored_values + if event in ['line']: + # triggered by every line of Python code. + # only Python functions will trigger it, + # c/cpp functions will not trigger it. 
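# (Descriptive note, not part of the original change: because only Python
# 'line' events trigger a measurement, a change made inside a C/C++ call is
# observed at the next Python line that executes afterwards.)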
+ try: + # Temporarily disable the trace function + sys.settrace(None) + # do a measurement + current_value = measure_func() + if len(monitored_values.values + ) == 0 or current_value != monitored_values.values[-1]: + monitored_values.values.append(current_value) + monitored_values.trace_stacks.append("".join( + traceback.format_stack())) + # Re-enable the trace function + sys.settrace(_trace_calls) + except NameError: + # modules are deleted during shutdown + pass + return _trace_calls + + try: + sys.settrace(_trace_calls) + yield monitored_values + finally: + sys.settrace(None) diff --git a/vllm/utils.py b/vllm/utils.py index 63057153f851d..2660b53d7bfb0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1742,10 +1742,10 @@ class MemorySnapshot: timestamp: float = 0.0 def measure(self): - self.torch_peak_in_bytes = torch.cuda.memory_stats( - )["allocated_bytes.all.peak"] - self.torch_memory_in_bytes = torch.cuda.memory_stats( - )["allocated_bytes.all.current"] + self.torch_peak_in_bytes = torch.cuda.max_memory_reserved() + # torch.cuda.memory_reserved() is how many bytes + # PyTorch gets from cuda (by calling cudaMalloc, etc.) + self.torch_memory_in_bytes = torch.cuda.memory_reserved() self.timestamp = time.time() def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": @@ -1822,10 +1822,10 @@ def memory_profiling( The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`. - The increase of ``torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). + The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`), - subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`. + subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`. """ # noqa torch.cuda.reset_peak_memory_stats() From ad9f1aa6796297a00456e715043f3eaad55bed53 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 8 Jan 2025 14:36:49 +0800 Subject: [PATCH 11/55] [doc] update wheels url (#11830) Signed-off-by: youkaichao --- docs/source/getting_started/installation/gpu-cuda.md | 4 ++-- python_only_dev.py | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md index 1cd513177bf0d..419b8163fc034 100644 --- a/docs/source/getting_started/installation/gpu-cuda.md +++ b/docs/source/getting_started/installation/gpu-cuda.md @@ -75,7 +75,7 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi ```console $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. 
The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. @@ -126,7 +126,7 @@ $ cd vllm $ VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -This will download the latest nightly wheel from https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation. +This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation. The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): diff --git a/python_only_dev.py b/python_only_dev.py index f70b4984025b3..7d95ac96e6e4b 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -7,7 +7,7 @@ or export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl pip install -e . """ # noqa diff --git a/setup.py b/setup.py index ba6953dbdc174..ef9f4e579e84d 100644 --- a/setup.py +++ b/setup.py @@ -252,7 +252,7 @@ def run(self): class repackage_wheel(build_ext): """Extracts libraries and other files from an existing wheel.""" - default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" def run(self) -> None: wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", From a1b2b8606e75ab8fbc066e7f0fae20c1e60244ca Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 7 Jan 2025 23:05:46 -0800 Subject: [PATCH 12/55] [Docs] Update sponsor name: 'Novita' to 'Novita AI' (#11833) --- README.md | 2 +- docs/source/community/sponsors.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8e85b460363fc..1f82229f39537 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ Compute Resources: - Google Cloud - Lambda Lab - Nebius -- Novita +- Novita AI - NVIDIA - Replicate - Roblox diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index 3d5a57baefbde..9d2af4c13b088 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -22,7 +22,7 @@ Compute Resources: - Google Cloud - Lambda Lab - Nebius -- Novita +- Novita AI - NVIDIA - Replicate - Roblox From cfd3219f5881e2abea1f7c9d2866ed1838c5057b Mon Sep 17 00:00:00 2001 From: Wallas Henrique Date: Wed, 8 Jan 2025 05:35:49 -0300 Subject: [PATCH 13/55] [Hardware][Apple] Native support for macOS Apple Silicon (#11696) Signed-off-by: Wallas Santos Co-authored-by: Michael Goin --- cmake/cpu_extension.cmake | 61 ++++++++++++++----- csrc/cpu/cpu_types_arm.hpp | 61 ++++++++++++++++++- 
csrc/cpu/utils.cpp | 23 +++++-- .../getting_started/installation/cpu-apple.md | 51 ++++++++++++++++ .../getting_started/installation/cpu-arm.md | 4 +- .../getting_started/installation/index.md | 1 + requirements-cpu.txt | 6 +- setup.py | 9 ++- vllm/config.py | 12 ++++ vllm/entrypoints/openai/api_server.py | 3 + vllm/utils.py | 7 +++ 11 files changed, 209 insertions(+), 29 deletions(-) create mode 100644 docs/source/getting_started/installation/cpu-apple.md diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 68f7ca1af05ad..714abca2a5ff7 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(MACOSX_FOUND TRUE) +endif() + + # # Define environment variables for special configurations # @@ -13,6 +18,9 @@ endif() include_directories("${CMAKE_SOURCE_DIR}/csrc") + +set (ENABLE_NUMA TRUE) + # # Check the compile flags # @@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") "-mf16c" ) endif() -list(APPEND CXX_COMPILE_FLAGS - "-fopenmp" - "-DVLLM_CPU_EXTENSION") -execute_process(COMMAND cat /proc/cpuinfo - RESULT_VARIABLE CPUINFO_RET - OUTPUT_VARIABLE CPUINFO) +if(MACOSX_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-Xpreprocessor" + "-fopenmp" + "-DVLLM_CPU_EXTENSION") +else() + list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-DVLLM_CPU_EXTENSION") +endif() -if (NOT CPUINFO_RET EQUAL 0) - message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") +if (NOT MACOSX_FOUND) + execute_process(COMMAND cat /proc/cpuinfo + RESULT_VARIABLE CPUINFO_RET + OUTPUT_VARIABLE CPUINFO) + if (NOT CPUINFO_RET EQUAL 0) + message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") + endif() endif() + function (find_isa CPUINFO TARGET OUT) string(FIND ${CPUINFO} ${TARGET} ISA_FOUND) if(NOT ISA_FOUND EQUAL -1) @@ -54,12 +72,17 @@ endfunction() is_avx512_disabled(AVX512_DISABLED) -find_isa(${CPUINFO} "avx2" AVX2_FOUND) -find_isa(${CPUINFO} "avx512f" AVX512_FOUND) -find_isa(${CPUINFO} "POWER10" POWER10_FOUND) -find_isa(${CPUINFO} "POWER9" POWER9_FOUND) -find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support -find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support +if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set(APPLE_SILICON_FOUND TRUE) +else() + find_isa(${CPUINFO} "avx2" AVX2_FOUND) + find_isa(${CPUINFO} "avx512f" AVX512_FOUND) + find_isa(${CPUINFO} "POWER10" POWER10_FOUND) + find_isa(${CPUINFO} "POWER9" POWER9_FOUND) + find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support + find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support +endif() + if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -103,6 +126,9 @@ elseif (ASIMD_FOUND) set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") endif() list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) +elseif(APPLE_SILICON_FOUND) + message(STATUS "Apple Silicon Detected") + set(ENABLE_NUMA OFF) else() message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") endif() @@ -139,7 +165,12 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") -list(APPEND LIBS numa) +if(ENABLE_NUMA) + list(APPEND LIBS numa) +else() + message(STATUS "NUMA is disabled") + add_compile_definitions(-DVLLM_NUMA_DISABLED) +endif() # # _C extension diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index 
73e0f8cb2e0fb..ae062a5b86892 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -91,11 +91,68 @@ struct FP16Vec16 : public Vec { vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); } } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. if (remainder > 0) { float16x8_t temp = reg.val[full_blocks]; - for (int i = 0; i < remainder; ++i) { - reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) + { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; } } } diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 1138a55df2f05..42a1c1d924bac 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -1,10 +1,22 @@ -#include -#include -#include -#include +#ifndef VLLM_NUMA_DISABLED + #include + #include + #include + #include +#endif #include "cpu_types.hpp" +#ifdef VLLM_NUMA_DISABLED +std::string init_cpu_threads_env(const std::string& cpu_ids) { + return std::string( + "Warning: NUMA is not enabled in this build. 
`init_cpu_threads_env` has " + "no effect to setup thread affinity."); +} + +#endif + +#ifndef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); TORCH_CHECK(omp_cpu_mask->size > 0); @@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { omp_lock_t writelock; omp_init_lock(&writelock); -#pragma omp parallel for schedule(static, 1) + #pragma omp parallel for schedule(static, 1) for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { cpu_set_t mask; CPU_ZERO(&mask); @@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { return ss.str(); } +#endif \ No newline at end of file diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md new file mode 100644 index 0000000000000..b55e4384d064d --- /dev/null +++ b/docs/source/getting_started/installation/cpu-apple.md @@ -0,0 +1,51 @@ +(installation-apple)= + +# Installation for macOS + +vLLM has experimental support for macOS with Apple Silicon. For now, users shall build from the source vLLM to natively run on macOS. For more details, like running on vLLM in a docker container, see [ARM CPU Documentation](installation-arm) + +Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. + +## Requirements + +- **Operating System**: `macOS Sonoma` or later +- **SDK** `XCode 15.4` or later with Command Line Tools +- **Compilers**: `Apple Clang >= 15.0.0` + + + +## Build and installation + +After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. + +``` +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ pip install -r requirements-cpu.txt +$ pip install -e . +``` + +```{note} +On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. +``` + + + +## Troubleshooting + +If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your +[Command Line Tools for Xcode](https://developer.apple.com/download/all/). + +``` +[...] fatal error: 'map' file not found + 1 | #include + | ^~~~~ + 1 error generated. + [2/8] Building CXX object CMakeFiles/_C.dir/csrc/cpu/pos_encoding.cpp.o + +[...] fatal error: 'cstddef' file not found + 10 | #include + | ^~~~~~~~~ + 1 error generated. +``` + diff --git a/docs/source/getting_started/installation/cpu-arm.md b/docs/source/getting_started/installation/cpu-arm.md index a46e2c010600d..e199073ed721f 100644 --- a/docs/source/getting_started/installation/cpu-arm.md +++ b/docs/source/getting_started/installation/cpu-arm.md @@ -2,7 +2,7 @@ # Installation for ARM CPUs -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more). 
For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: - CPU backend inference capabilities - Relevant runtime environment variables @@ -20,7 +20,7 @@ Contents: ## Requirements - **Operating System**: Linux or macOS -- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended) +- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS - **Instruction Set Architecture (ISA)**: NEON support is required (arm-backend-quick-start-dockerfile)= diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md index 83de1aff409b2..0ebadca2ccec9 100644 --- a/docs/source/getting_started/installation/index.md +++ b/docs/source/getting_started/installation/index.md @@ -11,6 +11,7 @@ gpu-cuda gpu-rocm cpu-x86 cpu-arm +cpu-apple hpu-gaudi tpu xpu diff --git a/requirements-cpu.txt b/requirements-cpu.txt index e62f313297762..056fbf5a7adec 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for CPUs -torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" -torch==2.5.1; platform_machine == "aarch64" +torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" +torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch -datasets # for benchmark scripts \ No newline at end of file +datasets # for benchmark scripts diff --git a/setup.py b/setup.py index ef9f4e579e84d..b6c1f5bc8ac3f 100644 --- a/setup.py +++ b/setup.py @@ -34,9 +34,14 @@ def load_module_from_path(module_name, path): VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE -if not sys.platform.startswith("linux"): +if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu": logger.warning( - "vLLM only supports Linux platform (including WSL). " + "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS") + VLLM_TARGET_DEVICE = "cpu" +elif not (sys.platform.startswith("linux") + or sys.platform.startswith("darwin")): + logger.warning( + "vLLM only supports Linux platform (including WSL) and MacOS." "Building on %s, " "so vLLM may not be able to run correctly", sys.platform) VLLM_TARGET_DEVICE = "empty" diff --git a/vllm/config.py b/vllm/config.py index 44426489f686a..535cbe97a311a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4,6 +4,7 @@ import hashlib import json import os +import sys import warnings from contextlib import contextmanager from dataclasses import dataclass, field, replace @@ -2259,6 +2260,17 @@ def _get_and_verify_dtype( "supported for POWERPC.") torch_dtype = torch.bfloat16 + # TODO: change this condition to check if the platform support bf16 + # instead of checking the OS. For instance M2 shall supports bf16 + # already. But we need to modify `cpu_extension.cmake` to activate + # the feature in the build. + if (current_platform.is_cpu() and sys.platform.startswith("darwin") + and current_platform.get_cpu_architecture() + == CpuArchEnum.ARM and config_dtype == torch.bfloat16): + logger.info("For macOS with Apple Silicon, currently bfloat16 " + "is not supported. 
Setting dtype to float16.") + torch_dtype = torch.float16 + if current_platform.is_hpu() and config_dtype == torch.float16: logger.info( "For HPU, we cast models to bfloat16 instead of" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 047f699e4f277..bc1471e1f534d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -7,6 +7,7 @@ import re import signal import socket +import sys import tempfile import uuid from argparse import Namespace @@ -805,6 +806,8 @@ def signal_handler(*_) -> None: ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, + # Workaround to work on macOS + fd=sock.fileno() if sys.platform.startswith("darwin") else None, **uvicorn_kwargs, ) diff --git a/vllm/utils.py b/vllm/utils.py index 2660b53d7bfb0..c09cae70e9af8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -524,6 +524,13 @@ def get_open_port() -> int: def find_process_using_port(port: int) -> Optional[psutil.Process]: + # TODO: We can not check for running processes with network + # port on macOS. Therefore, we can not have a full graceful shutdown + # of vLLM. For now, let's not look for processes in this case. + # Ref: https://www.florianreinhard.de/accessdenied-in-psutil/ + if sys.platform.startswith("darwin"): + return None + for conn in psutil.net_connections(): if conn.laddr.port == port: try: From f12141170a95ad866b3c55762623bc718994e1d7 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 8 Jan 2025 18:46:43 +0800 Subject: [PATCH 14/55] [torch.compile] consider relevant code in compilation cache (#11614) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 70 ++++++++++++++++++++++++++++++---- vllm/compilation/decorators.py | 28 +++++++++++++- vllm/config.py | 29 ++------------ vllm/sequence.py | 7 ++++ 4 files changed, 99 insertions(+), 35 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index a8dd628b9cd6f..87655530cead4 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -145,6 +145,7 @@ def wrap_inductor(graph: fx.GraphModule, example_inputs, additional_inductor_config, compilation_config: CompilationConfig, + vllm_backend: "VllmBackend", graph_index: int = 0, num_graphs: int = 1, runtime_shape: Optional[int] = None, @@ -176,7 +177,7 @@ def wrap_inductor(graph: fx.GraphModule, # see https://github.com/pytorch/pytorch/issues/138980 graph = copy.deepcopy(graph) - cache_data = compilation_config.inductor_hash_cache + cache_data = vllm_backend.inductor_hash_cache if (runtime_shape, graph_index) in cache_data: # we compiled this graph before # so we can directly lookup the compiled graph via hash @@ -196,7 +197,7 @@ def wrap_inductor(graph: fx.GraphModule, hash_str, example_inputs, True, False) assert inductor_compiled_graph is not None, ( "Inductor cache lookup failed. Please remove" - f"the cache file {compilation_config.inductor_hash_cache.cache_file_path} and try again." # noqa + f"the cache file {cache_data.cache_file_path} and try again." 
# noqa ) # Inductor calling convention (function signature): @@ -354,7 +355,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): def __init__(self, module: torch.fx.GraphModule, compile_submod_names: List[str], vllm_config: VllmConfig, - graph_pool): + graph_pool, vllm_backend: "VllmBackend"): super().__init__(module) from torch._guards import detect_fake_mode self.fake_mode = detect_fake_mode() @@ -362,6 +363,7 @@ def __init__(self, module: torch.fx.GraphModule, self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool self.vllm_config = vllm_config + self.vllm_backend = vllm_backend def run(self, *args): fake_args = [ @@ -389,6 +391,7 @@ def call_module(self, target: torch.fx.node.Target, args, self.compilation_config.inductor_compile_config, self.compilation_config, + self.vllm_backend, graph_index=index, num_graphs=len(self.compile_submod_names), runtime_shape=None, @@ -397,7 +400,7 @@ def call_module(self, target: torch.fx.node.Target, self.module.__dict__[target] = PiecewiseBackend( submod, self.vllm_config, self.graph_pool, index, len(self.compile_submod_names), sym_shape_indices, - compiled_graph_for_general_shape) + compiled_graph_for_general_shape, self.vllm_backend) compilation_counter.num_piecewise_capturable_graphs_seen += 1 @@ -430,6 +433,7 @@ class VllmBackend: post_grad_passes: Sequence[Callable] sym_tensor_indices: List[int] input_buffers: List[torch.Tensor] + inductor_hash_cache: InductorHashCache def __init__( self, @@ -472,6 +476,53 @@ def configure_post_pass(self): def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: + if not self.compilation_config.cache_dir: + # no provided cache dir, generate one based on the known factors + # that affects the compilation. if none of the factors change, + # the cache dir will be the same so that we can reuse the compiled + # graph. + + # 1. factors come from the vllm_config (it mainly summarizes how the + # model is created) + vllm_config = self.vllm_config + config_hash = vllm_config.compute_hash() + + # 2. 
factors come from the code files that are traced by Dynamo ( + # it mainly summarizes how the model is used in forward pass) + forward_code_files = list( + sorted(self.compilation_config.traced_files)) + self.compilation_config.traced_files.clear() + logger.debug( + "Traced files (to be considered for compilation cache):\n%s", + "\n".join(forward_code_files)) + hash_content = [] + for filepath in forward_code_files: + hash_content.append(filepath) + with open(filepath) as f: + hash_content.append(f.read()) + import hashlib + code_hash = hashlib.md5( + "\n".join(hash_content).encode()).hexdigest() + + # combine the two hashes to generate the cache dir + hash_key = hashlib.md5( + f"{config_hash}_{code_hash}".encode()).hexdigest()[:10] + cache_dir = os.path.join( + envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key, + f"rank_{vllm_config.parallel_config.rank}") + else: + cache_dir = self.compilation_config.cache_dir + os.makedirs(cache_dir, exist_ok=True) + + disabled = envs.VLLM_DISABLE_COMPILE_CACHE + self.inductor_hash_cache: InductorHashCache = InductorHashCache( + cache_dir, disabled=disabled) + if disabled: + logger.info("vLLM's torch.compile cache is disabled.") + else: + logger.info("Using cache directory: %s for vLLM's torch.compile", + cache_dir) + # when dynamo calls the backend, it means the bytecode # transform and analysis are done compilation_counter.num_graphs_seen += 1 @@ -507,8 +558,8 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # propagate the split graph to the piecewise backend, # compile submodules with symbolic shapes PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, - self.vllm_config, - self.graph_pool).run(*example_inputs) + self.vllm_config, self.graph_pool, + self).run(*example_inputs) self._called = True @@ -577,7 +628,8 @@ class PiecewiseBackend: def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, graph_pool: Any, piecewise_compile_index: int, total_piecewise_compiles: int, sym_shape_indices: List[int], - compiled_graph_for_general_shape: Callable): + compiled_graph_for_general_shape: Callable, + vllm_backend: VllmBackend): """ The backend for piecewise compilation. It mainly handles the compilation and cudagraph capturing. 
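
Editor's note on the hunk above: the torch.compile cache directory is now derived from two hashes, one over the engine configuration and one over the path and contents of every source file Dynamo traced, so changing either produces a new cache directory instead of silently reusing stale compiled graphs. Below is a minimal standalone sketch of that derivation; the helper name and signature are illustrative only and not part of the patch.

```python
import hashlib
import os
from typing import Iterable


def compile_cache_dir(config_hash: str, traced_files: Iterable[str],
                      cache_root: str, rank: int) -> str:
    """Sketch of the cache-key logic used in VllmBackend.__call__ above."""
    # Hash the path and full contents of every traced file, in sorted order,
    # so editing any of the forward-pass code yields a different key.
    hash_content = []
    for filepath in sorted(traced_files):
        hash_content.append(filepath)
        with open(filepath) as f:
            hash_content.append(f.read())
    code_hash = hashlib.md5("\n".join(hash_content).encode()).hexdigest()

    # Combine with the configuration hash and truncate, as the patch does.
    hash_key = hashlib.md5(
        f"{config_hash}_{code_hash}".encode()).hexdigest()[:10]

    return os.path.join(cache_root, "torch_compile_cache", hash_key,
                        f"rank_{rank}")
```

As in the patch, the per-rank suffix keeps tensor-parallel workers from sharing a directory, and `VLLM_DISABLE_COMPILE_CACHE` still bypasses the cache entirely.
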
@@ -597,6 +649,7 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, self.graph_pool = graph_pool self.piecewise_compile_index = piecewise_compile_index self.total_piecewise_compiles = total_piecewise_compiles + self.vllm_backend = vllm_backend self.is_first_graph = piecewise_compile_index == 0 self.is_last_graph = ( @@ -634,7 +687,7 @@ def check_for_ending_compilation(self): if self.is_last_graph and not self.to_be_compiled_sizes: # no specific sizes to compile # save the hash of the inductor graph for the next run - self.compilation_config.inductor_hash_cache.save_to_file() + self.vllm_backend.inductor_hash_cache.save_to_file() end_monitoring_torch_compile(self.vllm_config) def __call__(self, *args) -> Any: @@ -662,6 +715,7 @@ def __call__(self, *args) -> Any: args, self.compilation_config.inductor_compile_config, self.compilation_config, + self.vllm_backend, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, runtime_shape=runtime_shape, diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 805a217ee6ca1..10513111ea7f1 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,8 +1,10 @@ import inspect from typing import Callable, Dict, List, Optional, TypeVar, Union, overload +from unittest.mock import patch import torch import torch.nn as nn +from torch._dynamo.symbolic_convert import InliningInstructionTranslator from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher @@ -196,7 +198,31 @@ def __call__(self, *args, **kwargs): # we need to control all the compilation of the model. torch._dynamo.eval_frame.remove_from_cache( self.original_code_object) - return self.compiled_callable(*args, **kwargs) + + # collect all relevant files traced by Dynamo, + # so that the compilation cache can trigger re-compilation + # properly when any of these files change. + + # 1. the file containing the top-level forward function + self.vllm_config.compilation_config.traced_files.add( + self.original_code_object.co_filename) + + # 2. 
every time Dynamo sees a function call, it will inline + # the function by calling InliningInstructionTranslator.inline_call + # we hijack this function to know all the functions called + # during Dynamo tracing, and their corresponding files + inline_call = InliningInstructionTranslator.inline_call + + def patched_inline_call(parent, func, args, kwargs): + code = func.get_code() + self.vllm_config.compilation_config.traced_files.add( + code.co_filename) + return inline_call(parent, func, args, kwargs) + + with patch.object(InliningInstructionTranslator, 'inline_call', + patched_inline_call): + output = self.compiled_callable(*args, **kwargs) + return output # usually, capturing the model once is enough, and then we can # dispatch to the compiled code directly, without going through diff --git a/vllm/config.py b/vllm/config.py index 535cbe97a311a..6dabeb3861af2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,7 +3,6 @@ import enum import hashlib import json -import os import sys import warnings from contextlib import contextmanager @@ -2778,9 +2777,8 @@ def model_post_init(self, __context: Any) -> None: # keep track of enabled and disabled custom ops enabled_custom_ops: Counter[str] = PrivateAttr disabled_custom_ops: Counter[str] = PrivateAttr + traced_files: Set[str] = PrivateAttr compilation_time: float = PrivateAttr - # should be InductorHashCache, but Pydantic does not support it - inductor_hash_cache: Any = PrivateAttr # Per-model forward context # Mainly used to store attention cls @@ -2818,6 +2816,7 @@ def __repr__(self) -> str: "compilation_time", "bs_to_padded_graph_size", "pass_config", + "traced_files", } return self.model_dump_json(exclude=exclude, exclude_unset=True) @@ -2877,6 +2876,7 @@ def model_post_init(self, __context: Any) -> None: self.enabled_custom_ops = Counter() self.disabled_custom_ops = Counter() + self.traced_files = set() self.static_forward_context = {} self.compilation_time = 0.0 @@ -2899,29 +2899,6 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: # merge with the config use_inductor assert self.level == CompilationLevel.PIECEWISE - if not self.cache_dir: - # no provided cache dir, generate one based on the known factors - # that affects the compilation. if none of the factors change, - # the cache dir will be the same so that we can reuse the compiled - # graph. - hash_key = vllm_config.compute_hash() - cache_dir = os.path.join( - envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key, - f"rank_{vllm_config.parallel_config.rank}") - os.makedirs(cache_dir, exist_ok=True) - self.cache_dir = cache_dir - - disabled = envs.VLLM_DISABLE_COMPILE_CACHE - from vllm.compilation.backends import InductorHashCache - self.inductor_hash_cache: InductorHashCache = InductorHashCache( - self.cache_dir, disabled=disabled) - if disabled: - logger.info("vLLM's torch.compile cache is disabled.") - else: - logger.info( - "Using cache directory: %s for vLLM's torch.compile", - self.cache_dir) - from vllm.compilation.backends import VllmBackend return VllmBackend(vllm_config) diff --git a/vllm/sequence.py b/vllm/sequence.py index 0157abbd2eed5..5857f656dfc10 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1108,6 +1108,13 @@ class IntermediateTensors: tensors: Dict[str, torch.Tensor] + def __init__(self, tensors): + # manually define this function, so that + # Dynamo knows `IntermediateTensors()` comes from this file. 
+ # Otherwise, dataclass will generate this function by evaluating + # a string, and we will lose the information about the source file. + self.tensors = tensors + def __getitem__(self, key: Union[str, slice]): if isinstance(key, str): return self.tensors[key] From 2a0596bc480bb835dc05a30f5e708ecbfffbcd69 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 8 Jan 2025 18:59:58 +0800 Subject: [PATCH 15/55] [VLM] Reorganize profiling/processing-related code (#11812) Signed-off-by: DarkLight1337 --- .../processing/test_llava_next.py | 41 ++-- .../processing/test_llava_onevision.py | 41 ++-- .../vision_language/processing/test_phi3v.py | 24 +- .../processing/test_qwen2_vl.py | 22 +- tests/multimodal/test_processing.py | 52 ++--- .../vllm_add_dummy_model/my_llava.py | 10 +- vllm/inputs/preprocess.py | 2 +- vllm/inputs/registry.py | 4 +- vllm/model_executor/models/aria.py | 47 ++-- vllm/model_executor/models/blip2.py | 39 ++-- vllm/model_executor/models/chameleon.py | 47 ++-- vllm/model_executor/models/fuyu.py | 80 +++---- vllm/model_executor/models/llava.py | 175 +++++++------- vllm/model_executor/models/llava_next.py | 55 +++-- .../model_executor/models/llava_next_video.py | 104 +++++---- vllm/model_executor/models/llava_onevision.py | 115 +++++---- vllm/model_executor/models/phi3v.py | 83 +++---- vllm/model_executor/models/qwen2_audio.py | 49 ++-- vllm/model_executor/models/qwen2_vl.py | 113 +++++---- vllm/model_executor/models/ultravox.py | 46 ++-- vllm/multimodal/processing.py | 219 +++++++----------- vllm/multimodal/profiling.py | 152 +++++++++--- vllm/multimodal/registry.py | 73 +++++- 23 files changed, 833 insertions(+), 760 deletions(-) diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py index 9fa6a8a10a0f9..689d17be81889 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py @@ -4,24 +4,17 @@ import pytest from PIL import Image from pqdm.threads import pqdm -from transformers import AutoTokenizer -from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.parse import ImageSize +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import cached_get_tokenizer from ....utils import build_model_context -# Fixtures lazy import to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_llava_next(): - from vllm.model_executor.models.llava_next import ( - LlavaNextMultiModalProcessor) - return LlavaNextMultiModalProcessor - - def _validate_image_prompt_replacements_one( - processor, + processor: BaseMultiModalProcessor, num_imgs: int, failed_size_excs: list[tuple[ImageSize, Exception]], image_size: ImageSize, @@ -78,20 +71,17 @@ def _test_image_prompt_replacements( @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_prompt_replacements_regression( - processor_for_llava_next, - model_id: str, - num_imgs: int, -): +def test_processor_prompt_replacements_regression(model_id, num_imgs): ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) - processor 
= processor_for_llava_next(ctx) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), (488, 183), (2560, 1669)] @@ -111,20 +101,17 @@ def test_processor_prompt_replacements_regression( "Comment this out to run it manually.") @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("num_imgs", [1]) -def test_processor_prompt_replacements_all( - processor_for_llava_next, - model_id: str, - num_imgs: int, -): +def test_processor_prompt_replacements_all(model_id, num_imgs): ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) - processor = processor_for_llava_next(ctx) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py index d4cdffa210b6d..a033354f0e9b8 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py @@ -4,24 +4,17 @@ import pytest from PIL import Image from pqdm.threads import pqdm -from transformers import AutoTokenizer -from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.parse import ImageSize +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import cached_get_tokenizer from ....utils import build_model_context -# Fixtures lazy import to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_llava_onevision(): - from vllm.model_executor.models.llava_onevision import ( - LlavaOnevisionMultiModalProcessor) - return LlavaOnevisionMultiModalProcessor - - def _validate_image_prompt_replacements_one( - processor, + processor: BaseMultiModalProcessor, num_imgs: int, failed_size_excs: list[tuple[ImageSize, Exception]], image_size: ImageSize, @@ -77,20 +70,17 @@ def _test_image_prompt_replacements( @pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) @pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_prompt_replacements_regression( - processor_for_llava_onevision, - model_id: str, - num_imgs: int, -): +def test_processor_prompt_replacements_regression(model_id, num_imgs): ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) - processor = processor_for_llava_onevision(ctx) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), (488, 183), (2560, 1669)] @@ -111,20 +101,17 @@ def test_processor_prompt_replacements_regression( @pytest.mark.parametrize("model_id", 
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) @pytest.mark.parametrize("num_imgs", [1]) -def test_processor_prompt_replacements_all( - processor_for_llava_onevision, - model_id: str, - num_imgs: int, -): +def test_processor_prompt_replacements_all(model_id, num_imgs): ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) - processor = processor_for_llava_onevision(ctx) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/decoder_only/vision_language/processing/test_phi3v.py b/tests/models/decoder_only/vision_language/processing/test_phi3v.py index 249045b3c04ce..c5b77260c6544 100644 --- a/tests/models/decoder_only/vision_language/processing/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/processing/test_phi3v.py @@ -1,21 +1,13 @@ """Tests for phi3v's multimodal preprocessing kwargs.""" import pytest -from transformers import AutoTokenizer -from vllm.inputs import InputProcessingContext -from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import _ImageAssets from ....utils import build_model_context -# Wrap lazy imports to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_phi3v(): - from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor - return Phi3VMultiModalProcessor - - @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) # yapf: disable @pytest.mark.parametrize( @@ -29,7 +21,6 @@ def processor_for_phi3v(): # yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( - processor_for_phi3v, image_assets: _ImageAssets, model_id: str, mm_processor_kwargs: dict[str, int], @@ -37,21 +28,26 @@ def test_processor_override( num_imgs: int, ): """Ensure input_processor_for_phi3v handles num_crops properly.""" + # Avoid initializing CUDA early + from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID + ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, trust_remote_code=True, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) # Build the image str / prompt based on the number of images we pass img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" mm_data = {"image": [image_assets[0].pil_image] * num_imgs} - processor = processor_for_phi3v(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) # Ensure we have the right number of placeholders per num_crops size diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py index b9ac887edf90f..0d54802f2b733 100644 --- a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py +++ 
b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py @@ -1,19 +1,12 @@ import pytest -from transformers import AutoTokenizer -from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import _ImageAssets from ....utils import build_model_context -# Fixtures lazy import to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_qwen2_vl(): - from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor - return Qwen2VLMultiModalProcessor - - @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # yapf: disable @pytest.mark.parametrize( @@ -24,7 +17,6 @@ def processor_for_qwen2_vl(): # yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( - processor_for_qwen2_vl, image_assets: _ImageAssets, model_id: str, mm_processor_kwargs: dict[str, object], @@ -39,18 +31,20 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) # Build the image str / prompt based on the number of images we pass prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs mm_data = {"image": [image_assets[0].pil_image] * num_imgs} - processor = processor_for_qwen2_vl(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) # Ensure we have the right number of placeholders per num_crops size - hf_processor = processor._get_hf_processor(**mm_processor_kwargs) + hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 75d878217b657..d98bd9736b65f 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -10,12 +10,17 @@ from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, - _PlaceholderInfo, find_mm_placeholders, +# yapf conflicts with isort for this block +# yapf: disable +from vllm.multimodal.processing import (PlaceholderInfo, ProcessingCache, + PromptReplacement, + find_mm_placeholders, find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) +# yapf: enable +from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import full_groupby @@ -431,7 +436,7 @@ def test_find_replace_tokens( [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], { "pattern_1": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=0, start_idx=6, @@ -445,13 +450,13 @@ def test_find_replace_tokens( [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=0, start_idx=1, 
replacement=[32000, 32000], ), - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=1, start_idx=5, @@ -459,7 +464,7 @@ def test_find_replace_tokens( ), ], "pattern_3": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_3", item_idx=0, start_idx=7, @@ -472,13 +477,13 @@ def test_find_replace_tokens( [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=0, start_idx=1, replacement=[32000, 32000], ), - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=1, start_idx=3, @@ -486,7 +491,7 @@ def test_find_replace_tokens( ), ], "pattern_3": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_3", item_idx=0, start_idx=6, @@ -577,19 +582,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): revision=None, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( + processor = MULTIMODAL_REGISTRY.create_processor( model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), ) - - processor = processor_factory(ctx, cache=None) - profiler = processor.profiling_info + profiler = MultiModalProfiler(processor) mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) - profiler.get_supported_mm_limits = mock_supported_mm_limits + processor.info.get_supported_mm_limits = mock_supported_mm_limits if is_valid: exc_ctx = nullcontext() @@ -597,7 +598,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): exc_ctx = pytest.raises(ValueError, match="this model only supports") with exc_ctx: - profiler.get_mm_limits() + profiler.get_dummy_data(model_config.max_model_len) @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @@ -620,16 +621,12 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): revision=None, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( + processor = MULTIMODAL_REGISTRY.create_processor( model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), ) - processor = processor_factory(ctx, cache=None) - rng = np.random.RandomState(0) image = _rand_img(rng, min_wh=128, max_wh=256) if num_images == 0: @@ -681,9 +678,9 @@ def _test_processing_cache_correctness( hf_overrides=hf_overrides, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] ctx = InputProcessingContext( model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), @@ -691,8 +688,9 @@ def _test_processing_cache_correctness( # Ensure that it can fit all of the data cache = ProcessingCache(capacity=1 << 30) - baseline_processor = processor_factory(ctx, cache=None) - cached_processor = processor_factory(ctx, cache=cache) + baseline_processor = factories.build_processor(ctx, cache=None) + cached_processor = factories.build_processor(ctx, cache=cache) + dummy_inputs = baseline_processor.dummy_inputs rng = np.random.RandomState(0) @@ -724,7 +722,7 @@ def _test_processing_cache_correctness( } 
mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = baseline_processor.profiling_info.get_dummy_processor_inputs( + prompt = dummy_inputs.get_dummy_processor_inputs( model_config.max_model_len, mm_counts, ).prompt_text diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index 06dfebbb95527..ac64edfd4ec9d 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -2,13 +2,17 @@ import torch -from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, - LlavaMultiModalProcessor) +from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder, + LlavaForConditionalGeneration, + LlavaMultiModalProcessor, + LlavaProcessingInfo) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor, + info=LlavaProcessingInfo, + dummy_inputs=LlavaDummyInputsBuilder) class MyLlava(LlavaForConditionalGeneration): def compute_logits( diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b362ee0cac328..6ddc1eb76f10d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.utils import print_info_once, print_warning_once diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 2d9d024e03e80..b22b3f1594f24 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -323,6 +323,7 @@ def dummy_data_for_profiling( # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture from vllm.multimodal import MultiModalKwargs + from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer if mm_registry.has_processor(model_config): @@ -331,7 +332,8 @@ def dummy_data_for_profiling( trust_remote_code=model_config.trust_remote_code, ) processor = mm_registry.create_processor(model_config, tokenizer) - dummy_data = processor.get_dummy_data(seq_len) + profiler = MultiModalProfiler(processor) + dummy_data = profiler.get_dummy_data(seq_len) else: model_cls, _ = get_model_architecture(model_config) if is_encoder_data: diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 2e649f10c0765..089062ab53fc3 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -23,10 +23,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) +from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, 
ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) @@ -445,33 +445,33 @@ def build_mm_projector(config: PretrainedConfig): ) -class AriaProcessingMixin(ProcessingMixin): +class AriaProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config() - def _get_vision_config(self) -> AriaVisionConfig: - return self._get_hf_config().vision_config - - def _get_num_image_tokens(self) -> int: - hf_config = self._get_hf_config() - return max(hf_config.projector_patch_to_query_dict.values()) - - -class AriaProfilingInfo(AriaProcessingMixin, BaseProfilingInfo): + def get_vision_config(self) -> AriaVisionConfig: + return self.get_hf_config().vision_config def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_num_image_tokens()} + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + hf_config = self.get_hf_config() + return max(hf_config.projector_patch_to_query_dict.values()) + + +class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]): def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - vision_config = self._get_vision_config() + vision_config = self.info.get_vision_config() max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) @@ -483,7 +483,7 @@ def get_dummy_processor_inputs( num_images=num_images) } - hf_processor = self._get_hf_processor() + hf_processor = self.info.get_hf_processor() image_token: str = hf_processor.image_token # type: ignore return ProcessorInputs( @@ -492,10 +492,7 @@ def get_dummy_processor_inputs( ) -class AriaMultiModalProcessor(AriaProcessingMixin, BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return AriaProfilingInfo(self.ctx) +class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]): def _get_mm_fields_config( self, @@ -513,10 +510,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index - num_image_tokens = self._get_num_image_tokens() + num_image_tokens = self.info.get_num_image_tokens() return [ PromptReplacement( @@ -527,7 +524,9 @@ def _get_prompt_replacements( ] -@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor, + info=AriaProcessingInfo, + dummy_inputs=AriaDummyInputsBuilder) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): """ Aria model for conditional generation tasks. 
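
Editor's note: the Aria hunk above is representative of the pattern this commit applies to every multimodal model. A `BaseProcessingInfo` subclass answers static questions (HF config, per-modality limits, token counts), a `BaseDummyInputsBuilder` fabricates worst-case inputs for memory profiling, and the `BaseMultiModalProcessor` keeps only the HF-processor call and prompt-replacement logic; the three are wired together through `register_processor`. The skeleton below is a condensed, hypothetical sketch of that wiring: the `My*` names are made up for illustration, and only the vLLM base classes and the decorator come from the patch.

```python
from collections.abc import Mapping
from typing import Optional

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs


class MyProcessingInfo(BaseProcessingInfo):
    # Static facts about the model: allowed items per modality and the
    # worst-case number of placeholder tokens each item expands to.
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
        return {"image": 64}


class MyDummyInputsBuilder(BaseDummyInputsBuilder[MyProcessingInfo]):
    # Builds profiling inputs; `self.info` exposes the ProcessingInfo above.
    def get_dummy_processor_inputs(self, seq_len, mm_counts) -> ProcessorInputs:
        raise NotImplementedError  # model-specific


class MyMultiModalProcessor(BaseMultiModalProcessor[MyProcessingInfo]):
    # Only HF-processor invocation and prompt replacement live here.
    def _get_mm_fields_config(self, hf_inputs, hf_processor_mm_kwargs):
        raise NotImplementedError  # model-specific

    def _get_prompt_replacements(self, mm_items, hf_processor_mm_kwargs,
                                 out_mm_kwargs):
        raise NotImplementedError  # model-specific


@MULTIMODAL_REGISTRY.register_processor(MyMultiModalProcessor,
                                        info=MyProcessingInfo,
                                        dummy_inputs=MyDummyInputsBuilder)
class MyModelForConditionalGeneration:  # in practice, the model's nn.Module
    ...
```

Callers (including the updated tests in this commit) then obtain a ready-to-use processor via `MULTIMODAL_REGISTRY.create_processor(model_config, tokenizer=...)` rather than instantiating processor classes directly.
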
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index fd45783f167b4..7dfc0b687c6e3 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -17,10 +17,10 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .blip import BlipVisionModel @@ -397,30 +397,30 @@ def forward( return sequence_output -class Blip2ProcessingMixin(ProcessingMixin): +class Blip2ProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(Blip2Config) - def _get_num_image_tokens(self) -> int: - hf_config = self._get_hf_config() - return hf_config.num_query_tokens - - -class Blip2ProfilingInfo(Blip2ProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_num_image_tokens()} + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + hf_config = self.get_hf_config() + return hf_config.num_query_tokens + + +class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]): def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config max_image_size = vision_config.image_size @@ -439,10 +439,7 @@ def get_dummy_processor_inputs( ) -class Blip2MultiModalProcessor(Blip2ProcessingMixin, BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return Blip2ProfilingInfo(self.ctx) +class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): def _get_mm_fields_config( self, @@ -460,7 +457,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - num_image_tokens = self._get_num_image_tokens() + num_image_tokens = self.info.get_num_image_tokens() return [ PromptReplacement( @@ -491,7 +488,9 @@ def apply( return result -@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor, + info=Blip2ProcessingInfo, + dummy_inputs=Blip2DummyInputsBuilder) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 73ed73b61ebf9..acff926891bbe 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -30,10 +30,10 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - 
MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once @@ -49,33 +49,34 @@ class ChameleonImagePixelInputs(TypedDict): """Shape: `(batch_size * num_images, num_channels, height, width)`""" -class ChameleonProcessingMixin(ProcessingMixin): +class ChameleonProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(ChameleonConfig) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(ChameleonProcessor) - def _get_num_image_tokens(self) -> int: - processor = self._get_hf_processor() - return processor.image_seq_length - - -class ChameleonProfilingInfo(ChameleonProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_num_image_tokens()} + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + processor = self.get_hf_processor() + return processor.image_seq_length + + +class ChameleonDummyInputsBuilder( + BaseDummyInputsBuilder[ChameleonProcessingInfo]): def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - config = self._get_hf_config() + config = self.info.get_hf_config() width = height = config.vq_config.resolution num_images = mm_counts.get("image", 0) @@ -93,11 +94,8 @@ def get_dummy_processor_inputs( ) -class ChameleonMultiModalProcessor(ChameleonProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return ChameleonProfilingInfo(self.ctx) +class ChameleonMultiModalProcessor( + BaseMultiModalProcessor[ChameleonProcessingInfo]): def _get_mm_fields_config( self, @@ -112,7 +110,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - processor = self._get_hf_processor(**hf_processor_mm_kwargs) + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) return [ PromptReplacement( @@ -120,7 +118,7 @@ def _get_prompt_replacements( target="", replacement="".join([ processor.image_start_token, - processor.image_token * self._get_num_image_tokens(), + processor.image_token * self.info.get_num_image_tokens(), processor.image_end_token, ]), ) @@ -916,7 +914,10 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + ChameleonMultiModalProcessor, + info=ChameleonProcessingInfo, + dummy_inputs=ChameleonDummyInputsBuilder) class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index c937fcb0978b9..59af5f0b3ae98 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -33,11 +33,11 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageProcessorItems, ImageSize +from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, + MultiModalDataItems) 
from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -64,24 +64,38 @@ class FuyuImagePatchInputs(TypedDict): """ -class FuyuProcessingMixin(ProcessingMixin): +class FuyuProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(FuyuConfig) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(FuyuProcessor) - def _get_image_processor(self) -> FuyuImageProcessor: - return self._get_hf_processor().image_processor + def get_image_processor(self) -> FuyuImageProcessor: + return self.get_hf_processor().image_processor - def _get_image_feature_grid_size( + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self.get_image_size_with_most_features() + + max_ncols, max_nrows = self.get_image_feature_grid_size( + image_width=target_width, + image_height=target_height, + ) + max_image_tokens = (max_ncols + 1) * max_nrows + + return {"image": max_image_tokens} + + def get_image_feature_grid_size( self, *, image_width: int, image_height: int, ) -> tuple[int, int]: - image_processor = self._get_image_processor() + image_processor = self.get_image_processor() target_width = image_processor.size["width"] target_height = image_processor.size["height"] @@ -97,34 +111,21 @@ def _get_image_feature_grid_size( nrows = math.ceil(image_height / 30) return ncols, nrows - -class FuyuProfilingInfo(FuyuProcessingMixin, BaseProfilingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - target_width, target_height = self._get_image_size_with_most_features() - - max_ncols, max_nrows = self._get_image_feature_grid_size( - image_width=target_width, - image_height=target_height, - ) - max_image_tokens = (max_ncols + 1) * max_nrows - - return {"image": max_image_tokens} - - def _get_image_size_with_most_features(self) -> ImageSize: - image_processor = self._get_image_processor() + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() return ImageSize(width=image_processor.size["width"], height=image_processor.size["height"]) + +class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = \ + self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) mm_data = { @@ -140,10 +141,7 @@ def get_dummy_processor_inputs( ) -class FuyuMultiModalProcessor(FuyuProcessingMixin, BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return FuyuProfilingInfo(self.ctx) +class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): def _call_hf_processor( self, @@ -156,7 +154,7 @@ def _call_hf_processor( # Avoid warning from HF logger for text-only input # Input_ids format: 
bos_token_id + prompt_token_ids + boa_token_id # Tokenizer won't add boa_token_id by default, we add it manually. - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore prompt_ids = tokenizer.encode(prompt) + [boa_token_id] return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") @@ -196,10 +194,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() eot_token_id = tokenizer.bos_token_id assert isinstance(eot_token_id, int) @@ -207,7 +205,7 @@ def get_replacement_fuyu(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) - ncols, nrows = self._get_image_feature_grid_size( + ncols, nrows = self.info.get_image_feature_grid_size( image_width=image_size.width, image_height=image_size.height, ) @@ -244,7 +242,9 @@ def apply( return result -@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor, + info=FuyuProcessingInfo, + dummy_inputs=FuyuDummyInputsBuilder) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 305f1364dba23..8d94acf3b21d5 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,7 +1,7 @@ -from abc import ABC, abstractmethod +from abc import abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, Union) + Protocol, Set, Tuple, TypedDict, TypeVar, Union) import torch import torch.nn as nn @@ -25,11 +25,11 @@ MultiModalInputsV2, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, - ImageSize) + ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingCache, - ProcessingMixin, PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, ProcessingCache, + PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel @@ -105,34 +105,23 @@ class LlavaLikeProcessor(Protocol): image_token: Final[str] -class BaseLlavaProcessingMixin(ProcessingMixin, ABC): +class BaseLlavaProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self) -> LlavaLikeConfig: + def get_hf_config(self) -> LlavaLikeConfig: return self.ctx.get_hf_config(LlavaConfig) - def _get_vision_encoder_info(self): - return get_vision_encoder_info(self._get_hf_config()) + def get_vision_encoder_info(self): + return get_vision_encoder_info(self.get_hf_config()) @abstractmethod - def _get_hf_processor(self) -> LlavaLikeProcessor: + def get_hf_processor(self) -> LlavaLikeProcessor: raise NotImplementedError - def _get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - hf_config = self._get_hf_config() - vision_encoder_info = self._get_vision_encoder_info() + def 
get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} - return self._apply_feature_select_strategy( - hf_config.vision_feature_select_strategy, - vision_encoder_info.get_num_image_tokens( - image_width=image_width, - image_height=image_height, - ), - ) + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} def _apply_feature_select_strategy( self, @@ -147,28 +136,42 @@ def _apply_feature_select_strategy( msg = f"Unexpected feature select strategy: {strategy!r}" raise NotImplementedError(msg) + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + vision_encoder_info = self.get_vision_encoder_info() -class BaseLlavaProfilingInfo(BaseLlavaProcessingMixin, BaseProfilingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_max_image_tokens()} + return self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), + ) - def _get_image_size_with_most_features(self) -> ImageSize: - vision_encoder_info = self._get_vision_encoder_info() + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() width = height = vision_encoder_info.get_image_size() return ImageSize(width=width, height=height) - def _get_max_image_tokens(self) -> int: - target_width, target_height = self._get_image_size_with_most_features() + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() - return self._get_num_image_tokens( + return self.get_num_image_tokens( image_width=target_width, image_height=target_height, ) + +_I = TypeVar("_I", bound=BaseLlavaProcessingInfo) + + +class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -176,9 +179,10 @@ def get_dummy_processor_inputs( ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() image_token = processor.image_token - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = \ + self.info.get_image_size_with_most_features() mm_data = { "image": @@ -193,23 +197,13 @@ def get_dummy_processor_inputs( ) -class LlavaProcessingMixin(BaseLlavaProcessingMixin): +class LlavaProcessingInfo(BaseLlavaProcessingInfo): - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaProcessor) -class LlavaProfilingInfo(LlavaProcessingMixin, BaseLlavaProfilingInfo): - pass - - -class BaseLlavaMultiModalProcessor(LlavaProcessingMixin, - BaseMultiModalProcessor): - - # Copied from BaseMultiModalProcessor - @abstractmethod - def _get_profiling_info(self) -> BaseProfilingInfo: - raise NotImplementedError +class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]): # Copied from BaseMultiModalProcessor @abstractmethod @@ -226,7 +220,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index def 
get_replacement(item_idx: int): @@ -237,7 +231,7 @@ def get_replacement(item_idx: int): num_image_tokens = images.get_feature_size(item_idx) else: image_size = images.get_image_size(item_idx) - num_image_tokens = self._get_num_image_tokens( + num_image_tokens = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, ) @@ -253,10 +247,8 @@ def get_replacement(item_idx: int): ] -class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return LlavaProfilingInfo(self.ctx) +class LlavaMultiModalProcessor( + BaseLlavaMultiModalProcessor[LlavaProcessingInfo]): def _get_mm_fields_config( self, @@ -269,21 +261,14 @@ def _get_mm_fields_config( ) -class PixtralHFProcessingMixin(BaseLlavaProcessingMixin): +class PixtralHFProcessingInfo(BaseLlavaProcessingInfo): - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(PixtralProcessor) -class PixtralHFProfilingInfo(PixtralHFProcessingMixin, BaseLlavaProfilingInfo): - pass - - -class PixtralHFMultiModalProcessor(PixtralHFProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return PixtralHFProfilingInfo(self.ctx) +class PixtralHFMultiModalProcessor( + BaseMultiModalProcessor[PixtralHFProcessingInfo]): def _call_hf_processor( self, @@ -328,10 +313,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() image_token = processor.image_token image_break_token = processor.image_break_token image_end_token = processor.image_end_token @@ -363,26 +348,40 @@ def get_replacement(item_idx: int): ] +def _build_llava_or_pixtral_hf_info( + ctx: InputProcessingContext, ) -> BaseLlavaProcessingInfo: + hf_config = ctx.get_hf_config(LlavaConfig) + + if isinstance(hf_config.vision_config, PixtralVisionConfig): + return PixtralHFProcessingInfo(ctx) + + return LlavaProcessingInfo(ctx) + + def _build_llava_or_pixtral_hf_processor( - ctx: InputProcessingContext, + info: _I, + dummy_inputs: BaseDummyInputsBuilder[_I], *, cache: Optional[ProcessingCache] = None, enable_sanity_checks: bool = True, ) -> BaseMultiModalProcessor: - hf_config = ctx.get_hf_config(LlavaConfig) - - if isinstance(hf_config.vision_config, PixtralVisionConfig): + if isinstance(info, PixtralHFProcessingInfo): return PixtralHFMultiModalProcessor( - ctx, + info, + dummy_inputs, # type: ignore + cache=cache, + enable_sanity_checks=enable_sanity_checks, + ) + + if isinstance(info, LlavaProcessingInfo): + return LlavaMultiModalProcessor( + info, + dummy_inputs, # type: ignore cache=cache, enable_sanity_checks=enable_sanity_checks, ) - return LlavaMultiModalProcessor( - ctx, - cache=cache, - enable_sanity_checks=enable_sanity_checks, - ) + raise NotImplementedError(type(info)) def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: @@ -460,7 +459,9 @@ def init_vision_tower_for_llava( raise NotImplementedError(msg) -@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor) +@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor, + info=_build_llava_or_pixtral_hf_info, + dummy_inputs=LlavaDummyInputsBuilder) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): # 
BitandBytes specific attributes bitsandbytes_stacked_params_mapping = { @@ -727,11 +728,11 @@ def apply( mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index # Assume that it doesn't depend on the image size - num_image_tokens = self._get_num_image_tokens( + num_image_tokens = self.info.get_num_image_tokens( image_width=-1, image_height=-1, ) @@ -796,6 +797,8 @@ def get_replacement_mantis(item_idx: int): # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` -@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor, + info=LlavaProcessingInfo, + dummy_inputs=LlavaDummyInputsBuilder) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 815456dac2a2f..fda4f22d366b1 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,6 +1,7 @@ +from abc import abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, Union) + Protocol, Set, Tuple, TypedDict, TypeVar, Union) import torch import torch.nn as nn @@ -16,13 +17,12 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors from vllm.multimodal.parse import ImageSize -from vllm.multimodal.profiling import BaseProfilingInfo from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingMixin, - BaseLlavaProfilingInfo, LlavaLikeConfig, +from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingInfo, + LlavaDummyInputsBuilder, LlavaLikeConfig, LlavaMultiModalProjector, init_vision_tower_for_llava) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, @@ -65,23 +65,23 @@ class LlavaNextLikeConfig(LlavaLikeConfig, Protocol): image_grid_pinpoints: Final[list[list[int]]] -class LlavaNextProcessingMixin(BaseLlavaProcessingMixin): +class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): - def _get_hf_config(self) -> LlavaNextLikeConfig: + def get_hf_config(self) -> LlavaNextLikeConfig: return self.ctx.get_hf_config(LlavaNextConfig) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaNextProcessor) # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113 - def _get_num_image_tokens( + def get_num_image_tokens( self, *, image_width: int, image_height: int, ) -> int: - hf_config = self._get_hf_config() - vision_encoder_info = self._get_vision_encoder_info() + hf_config = self.get_hf_config() + vision_encoder_info = self.get_vision_encoder_info() base_feature_size = self._apply_feature_select_strategy( hf_config.vision_feature_select_strategy, @@ -140,16 +140,13 @@ def _get_num_unpadded_features( return (unpadded_features, newline_features) - -class LlavaNextProfilingInfo(LlavaNextProcessingMixin, BaseLlavaProfilingInfo): - - def _get_image_size_with_most_features(self) -> ImageSize: - hf_config = 
self._get_hf_config() + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() largest_feature_size, largest_feature_pinpoint = 0, None for (height, width) in hf_config.image_grid_pinpoints: - feat_size = self._get_num_image_tokens(image_width=width, - image_height=height) + feat_size = self.get_num_image_tokens(image_width=width, + image_height=height) if feat_size > largest_feature_size: largest_feature_size = feat_size largest_feature_pinpoint = ImageSize(width=width, @@ -161,11 +158,23 @@ def _get_image_size_with_most_features(self) -> ImageSize: return largest_feature_pinpoint -class LlavaNextMultiModalProcessor(LlavaNextProcessingMixin, - BaseLlavaMultiModalProcessor): +_I = TypeVar("_I", bound=LlavaNextProcessingInfo) + + +class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]): + + # Copied from BaseMultiModalProcessor + @abstractmethod + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + raise NotImplementedError + - def _get_profiling_info(self) -> BaseProfilingInfo: - return LlavaNextProfilingInfo(self.ctx) +class LlavaNextMultiModalProcessor( + BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]): def _get_mm_fields_config( self, @@ -179,7 +188,9 @@ def _get_mm_fields_config( ) -@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor, + info=LlavaNextProcessingInfo, + dummy_inputs=LlavaDummyInputsBuilder) class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 6e82cee1c95a4..5be85d7c0f033 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -17,12 +17,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import (ImageSize, VideoEmbeddingItems, - VideoProcessorItems) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -47,33 +46,52 @@ class LlavaNextVideoPixelInputs(TypedDict): """ -class LlavaNextVideoProcessingMixin(ProcessingMixin): +class LlavaNextVideoProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(LlavaNextVideoConfig) - def _get_vision_encoder_info(self): - return get_vision_encoder_info(self._get_hf_config()) + def get_vision_encoder_info(self): + return get_vision_encoder_info(self.get_hf_config()) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaNextVideoProcessor) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"video": 1} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self.get_image_size_with_most_features() + + max_video_tokens = 
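# Standalone sketch of the "largest pinpoint" search shown above: scan the
# configured (height, width) grid pinpoints and keep the one that yields the
# most image tokens. The token counter passed in here is a toy stand-in for
# get_num_image_tokens().
from typing import Callable, List, Tuple


def image_size_with_most_features(
    pinpoints: List[Tuple[int, int]],
    num_image_tokens: Callable[[int, int], int],
) -> Tuple[int, int]:
    largest_size, largest_pinpoint = 0, None
    for height, width in pinpoints:
        feat_size = num_image_tokens(width, height)
        if feat_size > largest_size:
            largest_size = feat_size
            largest_pinpoint = (width, height)
    if largest_size == 0 or largest_pinpoint is None:
        raise ValueError("Cannot have a largest feature size of 0!")
    return largest_pinpoint


# Toy token counter: one token per 14x14 patch.
print(image_size_with_most_features(
    [(336, 672), (672, 672)],
    lambda w, h: (w // 14) * (h // 14),
))  # -> (672, 672)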
self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self.get_num_frames_with_most_features(seq_len), + ) + + return {"video": max_video_tokens} + + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) + def _get_num_frame_tokens( self, *, image_width: int, image_height: int, ) -> int: - hf_config = self._get_hf_config() + hf_config = self.get_hf_config() spatial_pool_stride = hf_config.spatial_pool_stride - vision_encoder_info = self._get_vision_encoder_info() + vision_encoder_info = self.get_vision_encoder_info() patch_grid_length = vision_encoder_info.get_patch_grid_length() pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) return pooled_grid_length * pooled_grid_length - def _get_num_video_tokens( + def get_num_video_tokens( self, *, image_width: int, @@ -87,37 +105,14 @@ def _get_num_video_tokens( return num_frame_tokens * num_frames - -class LlavaNextVideoProfilingInfo(LlavaNextVideoProcessingMixin, - BaseProfilingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"video": 1} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - target_width, target_height = self._get_image_size_with_most_features() - - max_video_tokens = self._get_num_video_tokens( - image_width=target_width, - image_height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), - ) - - return {"video": max_video_tokens} - - def _get_image_size_with_most_features(self) -> ImageSize: - vision_encoder_info = self._get_vision_encoder_info() - width = height = vision_encoder_info.get_image_size() - return ImageSize(width=width, height=height) - def _get_max_video_frames(self, max_tokens: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = self.get_image_size_with_most_features() num_frames = 0 while True: next_num_frames = num_frames + 1 - next_max_tokens = self._get_num_video_tokens( + next_max_tokens = self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=next_num_frames, @@ -130,7 +125,7 @@ def _get_max_video_frames(self, max_tokens: int) -> int: return num_frames - def _get_dummy_num_frames(self, seq_len: int) -> int: + def get_num_frames_with_most_features(self, seq_len: int) -> int: mm_config = self.ctx.get_mm_config() max_videos = mm_config.limit_per_prompt.get("video", 1) @@ -138,6 +133,10 @@ def _get_dummy_num_frames(self, seq_len: int) -> int: return max(max_total_frames // max(max_videos, 1), 1) + +class LlavaNextVideoDummyInputsBuilder( + BaseDummyInputsBuilder[LlavaNextVideoProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -145,16 +144,20 @@ def get_dummy_processor_inputs( ) -> ProcessorInputs: num_videos = mm_counts.get("video", 0) - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() video_token = processor.video_token - target_width, target_height = self._get_image_size_with_most_features() + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len) mm_data = { "video": self._get_dummy_videos( width=target_width, height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=target_num_frames, num_videos=num_videos, ) } @@ -165,11 
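# Standalone sketch of the frame-budget search shown above: grow the frame
# count one step at a time until the per-video token estimate would exceed the
# budget, then split the total across the allowed number of videos.
# `tokens_per_video` is a toy stand-in for get_num_video_tokens().
from typing import Callable


def max_video_frames(max_tokens: int,
                     tokens_per_video: Callable[[int], int]) -> int:
    num_frames = 0
    while True:
        next_num_frames = num_frames + 1
        if tokens_per_video(next_num_frames) > max_tokens:
            break
        num_frames = next_num_frames
    return num_frames


def frames_with_most_features(seq_len: int, max_videos: int,
                              tokens_per_video: Callable[[int], int]) -> int:
    max_total_frames = max_video_frames(seq_len, tokens_per_video)
    return max(max_total_frames // max(max_videos, 1), 1)


# 144 tokens per frame (a 12x12 pooled grid), 4096-token budget, one video.
print(frames_with_most_features(4096, 1, lambda f: 144 * f))  # -> 28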
+168,8 @@ def get_dummy_processor_inputs( ) -class LlavaNextVideoMultiModalProcessor(LlavaNextVideoProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return LlavaNextVideoProfilingInfo(self.ctx) +class LlavaNextVideoMultiModalProcessor( + BaseMultiModalProcessor[LlavaNextVideoProcessingInfo]): def _get_mm_fields_config( self, @@ -184,7 +184,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() video_token_id = hf_config.video_token_index def get_replacement(item_idx: int): @@ -195,7 +195,7 @@ def get_replacement(item_idx: int): num_video_tokens = videos.get_feature_size(item_idx) else: image_size = videos.get_frame_size(item_idx) - num_video_tokens = self._get_num_video_tokens( + num_video_tokens = self.info.get_num_video_tokens( image_width=image_size.width, image_height=image_size.height, num_frames=videos.get_num_frames(item_idx), @@ -269,7 +269,11 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -@MULTIMODAL_REGISTRY.register_processor(LlavaNextVideoMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + LlavaNextVideoMultiModalProcessor, + info=LlavaNextVideoProcessingInfo, + dummy_inputs=LlavaNextVideoDummyInputsBuilder, +) class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index b5e3edba1f01c..78a47e64d9afc 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -17,19 +17,20 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, - VideoEmbeddingItems, VideoProcessorItems) -from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, + VideoProcessorItems) +from vllm.multimodal.processing import PromptReplacement +from vllm.multimodal.profiling import ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import BaseLlavaProfilingInfo, init_vision_tower_for_llava -from .llava_next import (LlavaNextLikeConfig, LlavaNextMultiModalProcessor, - LlavaNextProcessingMixin) +from .llava import LlavaDummyInputsBuilder, init_vision_tower_for_llava +from .llava_next import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig, + LlavaNextProcessingInfo) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -89,14 +90,23 @@ class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol): video_token_index: Final[int] -class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin): +class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): - def 
_get_hf_config(self) -> LlavaOnevisionLikeConfig: + def get_hf_config(self) -> LlavaOnevisionLikeConfig: return self.ctx.get_hf_config(LlavaOnevisionConfig) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaOnevisionProcessor) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "video": self.get_max_video_tokens(seq_len), + } + # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86 # with additional logic afterwards taken from LlavaOnevisionProcessor def _get_num_unpadded_features( @@ -141,16 +151,16 @@ def _get_num_frame_tokens( image_width: int, image_height: int, ) -> int: - hf_config = self._get_hf_config() + hf_config = self.get_hf_config() spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2) - vision_encoder_info = self._get_vision_encoder_info() + vision_encoder_info = self.get_vision_encoder_info() patch_grid_length = vision_encoder_info.get_patch_grid_length() pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) return pooled_grid_length * pooled_grid_length - def _get_num_video_tokens( + def get_num_video_tokens( self, *, image_width: int, @@ -164,43 +174,14 @@ def _get_num_video_tokens( return num_frame_tokens * num_frames + 1 # Newline token - -class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin, - BaseLlavaProfilingInfo): - - def _get_image_size_with_most_features(self) -> ImageSize: - hf_config = self._get_hf_config() - largest_feature_size, largest_feature_pinpoint = 0, None - for (height, width) in hf_config.image_grid_pinpoints: - feat_size = self._get_num_image_tokens(image_width=width, - image_height=height) - if feat_size > largest_feature_size: - largest_feature_size = feat_size - largest_feature_pinpoint = ImageSize(width=width, - height=height) - - if largest_feature_size == 0 or largest_feature_pinpoint is None: - raise ValueError("Cannot have a largest feature size of 0!") - - return largest_feature_pinpoint - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return { - "image": self._get_max_image_tokens(), - "video": self._get_max_video_tokens(seq_len), - } - def _get_max_video_frames(self, max_tokens: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = self.get_image_size_with_most_features() num_frames = 0 while True: next_num_frames = num_frames + 1 - next_max_tokens = self._get_num_video_tokens( + next_max_tokens = self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=next_num_frames, @@ -213,12 +194,12 @@ def _get_max_video_frames(self, max_tokens: int) -> int: return num_frames - def _get_dummy_num_frames(self, seq_len: int) -> int: + def get_num_frames_with_most_features(self, seq_len: int) -> int: mm_config = self.ctx.get_mm_config() max_images = mm_config.limit_per_prompt.get("image", 1) max_videos = mm_config.limit_per_prompt.get("video", 1) - max_image_tokens = self._get_max_image_tokens() * max_images + max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) max_frames_per_video = 
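# Standalone sketch of the pooled frame-token arithmetic shown above: the
# vision tower's patch grid is pooled by `spatial_pool_stride`, each frame
# contributes the squared pooled grid length, and LLaVA-OneVision appends one
# trailing newline token per video. The sample numbers are illustrative only.
import math


def num_frame_tokens(patch_grid_length: int, spatial_pool_stride: int) -> int:
    pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride)
    return pooled_grid_length * pooled_grid_length


def num_video_tokens(patch_grid_length: int, spatial_pool_stride: int,
                     num_frames: int) -> int:
    frame_tokens = num_frame_tokens(patch_grid_length, spatial_pool_stride)
    return frame_tokens * num_frames + 1  # +1 newline token


# e.g. a 27x27 patch grid pooled with stride 2, over 8 frames.
print(num_video_tokens(27, 2, 8))  # -> 14 * 14 * 8 + 1 = 1569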
min(max_total_frames // max(max_videos, 1), @@ -226,15 +207,19 @@ def _get_dummy_num_frames(self, seq_len: int) -> int: return max(max_frames_per_video, 1) - def _get_max_video_tokens(self, seq_len: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + def get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() - return self._get_num_video_tokens( + return self.get_num_video_tokens( image_width=target_width, image_height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=self.get_num_frames_with_most_features(seq_len), ) + +class LlavaOnevisionDummyInputsBuilder( + LlavaDummyInputsBuilder[LlavaOnevisionProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -243,10 +228,14 @@ def get_dummy_processor_inputs( num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() image_token = processor.image_token video_token = processor.video_token - target_width, target_height = self._get_image_size_with_most_features() + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len) mm_data = { "image": @@ -257,7 +246,7 @@ def get_dummy_processor_inputs( self._get_dummy_videos( width=target_width, height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=target_num_frames, num_videos=num_videos, ) } @@ -268,11 +257,8 @@ def get_dummy_processor_inputs( ) -class LlavaOnevisionMultiModalProcessor(LlavaOnevisionProcessingMixin, - LlavaNextMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return LlavaOnevisionProfilingInfo(self.ctx) +class LlavaOnevisionMultiModalProcessor( + BaseLlavaNextMultiModalProcessor[LlavaOnevisionProcessingInfo]): def _get_mm_fields_config( self, @@ -303,7 +289,7 @@ def _call_hf_processor( mm_kwargs=mm_kwargs, ) - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() video_token = processor.video_token # LLaVA-OneVision processor doesn't support multiple videos @@ -345,7 +331,7 @@ def _get_prompt_replacements( out_mm_kwargs=out_mm_kwargs, ) - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() video_token_id = hf_config.video_token_index def get_video_replacement(item_idx: int): @@ -356,7 +342,7 @@ def get_video_replacement(item_idx: int): num_video_tokens = videos.get_feature_size(item_idx) else: image_size = videos.get_frame_size(item_idx) - num_video_tokens = self._get_num_video_tokens( + num_video_tokens = self.info.get_num_video_tokens( image_width=image_size.width, image_height=image_size.height, num_frames=videos.get_num_frames(item_idx), @@ -393,7 +379,10 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -@MULTIMODAL_REGISTRY.register_processor(LlavaOnevisionMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + LlavaOnevisionMultiModalProcessor, + info=LlavaOnevisionProcessingInfo, + dummy_inputs=LlavaOnevisionDummyInputsBuilder) class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index c8418c14e5fdf..a1b1af35604db 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -34,13 +34,12 @@ MultiModalInputsV2, 
MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, - ImageSize) + ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement, - _BoundPromptReplacement, - _PlaceholderInfo) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, + BoundPromptReplacement, + PlaceholderInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -302,9 +301,9 @@ def add_image_newline(self, image_features_hd): return image_features_hd_newline -class Phi3VProcessingMixin(ProcessingMixin): +class Phi3VProcessingInfo(BaseProcessingInfo): - def _get_hf_processor( + def get_hf_processor( self, *, num_crops: Optional[int] = None, @@ -314,39 +313,42 @@ def _get_hf_processor( return self.ctx.get_hf_processor() - def _get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - processor = self._get_hf_processor() - - return processor.calc_num_image_tokens_from_image_size( # type: ignore - width=image_width, - height=image_height, - ) - - -class Phi3VProfilingInfo(Phi3VProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = self.get_image_size_with_most_features() - max_image_tokens = self._get_num_image_tokens( + max_image_tokens = self.get_num_image_tokens( image_width=target_width, image_height=target_height, + processor=None, ) return {"image": max_image_tokens} - def _get_image_size_with_most_features(self) -> ImageSize: + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[ProcessorMixin], + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + return processor.calc_num_image_tokens_from_image_size( # type: ignore + width=image_width, + height=image_height, + ) + + def get_image_size_with_most_features(self) -> ImageSize: # Result in the max possible feature size (h:w = 16:1) return ImageSize(height=8000, width=50) + +class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -354,7 +356,8 @@ def get_dummy_processor_inputs( ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = \ + self.info.get_image_size_with_most_features() mm_data = { "image": @@ -363,7 +366,7 @@ def get_dummy_processor_inputs( num_images=num_images) } - hf_processor = self._get_hf_processor() + hf_processor = self.info.get_hf_processor() image_tokens: list[str] = hf_processor.img_tokens # type: ignore return ProcessorInputs( @@ -372,10 +375,7 @@ def get_dummy_processor_inputs( ) -class Phi3VMultiModalProcessor(Phi3VProcessingMixin, BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return Phi3VProfilingInfo(self.ctx) +class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]): def _call_hf_processor( self, @@ -416,10 +416,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: 
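# Standalone sketch of the optional-injection pattern shown above for Phi-3V:
# token-count helpers accept an already-constructed processor so hot paths can
# reuse it, and fall back to building one only when the caller passes None.
# `ToyProcessor` and `ToyInfo` are made-up stand-ins, not the real classes.
from typing import Optional


class ToyProcessor:

    def calc_num_image_tokens_from_image_size(self, *, width: int,
                                              height: int) -> int:
        return (width // 336) * (height // 336) * 144


class ToyInfo:

    def get_hf_processor(self) -> ToyProcessor:
        return ToyProcessor()  # imagine this construction being expensive

    def get_num_image_tokens(self, *, image_width: int, image_height: int,
                             processor: Optional[ToyProcessor]) -> int:
        if processor is None:
            processor = self.get_hf_processor()
        return processor.calc_num_image_tokens_from_image_size(
            width=image_width, height=image_height)


info = ToyInfo()
shared = info.get_hf_processor()
# Reuse the caller's processor instead of rebuilding it per image.
print(info.get_num_image_tokens(image_width=672, image_height=672,
                                processor=shared))  # -> 576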
MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_tokens: list[str] = hf_processor.img_tokens # type: ignore - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() bos_token_id = tokenizer.bos_token_id assert isinstance(bos_token_id, int) @@ -431,9 +431,10 @@ def get_replacement_phi3v(item_idx: int): num_image_tokens = images.get_feature_size(item_idx) else: image_size = images.get_image_size(item_idx) - num_image_tokens = self._get_num_image_tokens( + num_image_tokens = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, + processor=hf_processor, ) return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id] @@ -451,9 +452,9 @@ def get_replacement_phi3v(item_idx: int): def _apply_prompt_replacements( self, token_ids: list[int], - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]: token_ids, text, placeholders = super()._apply_prompt_replacements( token_ids=token_ids, mm_prompt_repls=mm_prompt_repls, @@ -466,7 +467,7 @@ def _apply_prompt_replacements( token_ids = [token_ids[0], *token_ids[2:]] placeholders = { modality: [ - _PlaceholderInfo( + PlaceholderInfo( modality=p.modality, item_idx=p.item_idx, start_idx=p.start_idx - 1, @@ -499,7 +500,9 @@ def apply( return result -@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor, + info=Phi3VProcessingInfo, + dummy_inputs=Phi3VDummyInputsBuilder) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 7012ddc66cd9c..0dff9595c6c08 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -38,11 +38,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser +from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, + MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -80,12 +80,12 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): return feat_lengths, output_lengths -class Qwen2AudioProcessingMixin(ProcessingMixin): +class Qwen2AudioProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(Qwen2AudioConfig) - def _get_hf_processor( + def get_hf_processor( self, *, # Ignored in initialization @@ -93,36 +93,37 @@ def _get_hf_processor( ) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor) - def _get_feature_extractor( + def 
get_feature_extractor( self, *, # Ignored in initialization sampling_rate: Optional[int] = None, ) -> WhisperFeatureExtractor: - hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) + hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor - -class Qwen2AudioProfilingInfo(Qwen2AudioProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - hf_config = self._get_hf_config() + hf_config = self.get_hf_config() max_source_positions = hf_config.audio_config.max_source_positions max_output_lengths = (max_source_positions - 2) // 2 + 1 return {"audio": max_output_lengths} + +class Qwen2AudioDummyInputsBuilder( + BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate @@ -139,14 +140,11 @@ def get_dummy_processor_inputs( ) -class Qwen2AudioMultiModalProcessor(Qwen2AudioProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return Qwen2AudioProfilingInfo(self.ctx) +class Qwen2AudioMultiModalProcessor( + BaseMultiModalProcessor[Qwen2AudioProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( @@ -161,7 +159,7 @@ def _call_hf_processor( if audios: mm_data["audios"] = audios - feature_extractor = self._get_feature_extractor(**mm_kwargs) + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, @@ -194,7 +192,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() placeholder = hf_config.audio_token_index feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") @@ -234,10 +232,13 @@ def _always_apply_prompt_replacements(self) -> bool: # has already performed processing for multi-audio input when the input # audios are short (the corresponding placeholders may take up fewer # tokens than the number of audio items) - return not hasattr(self._get_hf_processor(), "audio_token") + return not hasattr(self.info.get_hf_processor(), "audio_token") -@MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + Qwen2AudioMultiModalProcessor, + info=Qwen2AudioProcessingInfo, + dummy_inputs=Qwen2AudioDummyInputsBuilder) class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a5c2fb9e84df3..8537fec854b6d 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -57,11 +57,10 @@ MultiModalFieldConfig, MultiModalKwargs, NestedTensors, VideoItem) from 
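# Standalone sketch of the audio-token budget shown above for Qwen2-Audio:
# the encoder's max_source_positions is reduced by the same
# (length - 2) // 2 + 1 downsampling applied to audio features, and that
# result is what get_mm_max_tokens_per_item reports per audio item. The 1500
# below is only an illustrative Whisper-style encoder size.
def max_audio_placeholder_tokens(max_source_positions: int) -> int:
    return (max_source_positions - 2) // 2 + 1


print(max_audio_placeholder_tokens(1500))  # -> 750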
vllm.multimodal.parse import (ImageSize, ModalityDataItems, - MultiModalDataParser) + MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -709,12 +708,12 @@ def _parse_video_data( return super()._parse_video_data(data) -class Qwen2VLProcessingMixin(ProcessingMixin): +class Qwen2VLProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) - def _get_hf_processor( + def get_hf_processor( self, *, min_pixels: Optional[int] = None, @@ -736,18 +735,27 @@ def _get_hf_processor( return hf_processor - def _get_image_processor( + def get_image_processor( self, *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, ): - hf_processor = self._get_hf_processor(min_pixels=min_pixels, - max_pixels=max_pixels) + hf_processor = self.get_hf_processor(min_pixels=min_pixels, + max_pixels=max_pixels) image_processor = hf_processor.image_processor # type: ignore assert isinstance(image_processor, Qwen2VLImageProcessor) return image_processor + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "video": self.get_max_video_tokens(seq_len), + } + def _get_vision_info( self, *, @@ -755,15 +763,17 @@ def _get_vision_info( image_height: int, num_frames: int = 1, do_resize: bool = True, + image_processor: Optional[Qwen2VLImageProcessor], ) -> tuple[ImageSize, int]: - hf_config = self._get_hf_config() + if image_processor is None: + image_processor = self.get_image_processor() + + hf_config = self.get_hf_config() vision_config = hf_config.vision_config patch_size = vision_config.patch_size merge_size = vision_config.spatial_merge_size temporal_patch_size = vision_config.temporal_patch_size - image_processor = self._get_image_processor() - if do_resize: resized_height, resized_width = smart_resize( height=image_height, @@ -787,70 +797,65 @@ def _get_vision_info( return preprocessed_size, num_vision_tokens - def _get_num_image_tokens( + def get_num_image_tokens( self, *, image_width: int, image_height: int, + image_processor: Optional[Qwen2VLImageProcessor], ) -> int: _, num_image_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, + image_processor=image_processor, ) return num_image_tokens - def _get_num_video_tokens( + def get_num_video_tokens( self, *, image_width: int, image_height: int, num_frames: int, + image_processor: Optional[Qwen2VLImageProcessor], ) -> int: _, num_video_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, num_frames=num_frames, + image_processor=image_processor, ) return num_video_tokens - -class Qwen2VLProfilingInfo(Qwen2VLProcessingMixin, BaseProfilingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return { - "image": self._get_max_image_tokens(), - "video": 
self._get_max_video_tokens(seq_len), - } - - def _get_image_size_with_most_features(self) -> ImageSize: + def get_image_size_with_most_features(self) -> ImageSize: max_image_size, _ = self._get_vision_info( image_width=9999999, image_height=9999999, + image_processor=None, ) return max_image_size - def _get_max_image_tokens(self) -> int: - target_width, target_height = self._get_image_size_with_most_features() + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() - return self._get_num_image_tokens( + return self.get_num_image_tokens( image_width=target_width, image_height=target_height, + image_processor=None, ) def _get_max_video_frames(self, max_tokens: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = self.get_image_size_with_most_features() num_frames = 0 while True: next_num_frames = num_frames + 1 - next_max_tokens = self._get_num_video_tokens( + next_max_tokens = self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=next_num_frames, + image_processor=None, ) if next_max_tokens > max_tokens: @@ -860,12 +865,12 @@ def _get_max_video_frames(self, max_tokens: int) -> int: return num_frames - def _get_dummy_num_frames(self, seq_len: int) -> int: + def get_num_frames_with_most_features(self, seq_len: int) -> int: mm_config = self.ctx.get_mm_config() max_images = mm_config.limit_per_prompt.get("image", 1) max_videos = mm_config.limit_per_prompt.get("video", 1) - max_image_tokens = self._get_max_image_tokens() * max_images + max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) @@ -877,15 +882,19 @@ def _get_dummy_num_frames(self, seq_len: int) -> int: return num_frames - def _get_max_video_tokens(self, seq_len: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + def get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() - return self._get_num_video_tokens( + return self.get_num_video_tokens( image_width=target_width, image_height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=self.get_num_frames_with_most_features(seq_len), + image_processor=None, ) + +class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -894,10 +903,14 @@ def get_dummy_processor_inputs( num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - hf_processor = self._get_hf_processor() + hf_processor = self.info.get_hf_processor() image_token: str = hf_processor.image_token video_token: str = hf_processor.video_token - target_width, target_height = self._get_image_size_with_most_features() + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len) mm_data = { "image": @@ -908,7 +921,7 @@ def get_dummy_processor_inputs( self._get_dummy_videos( width=target_width, height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=target_num_frames, num_videos=num_videos, ) } @@ -919,11 +932,8 @@ def get_dummy_processor_inputs( ) -class Qwen2VLMultiModalProcessor(Qwen2VLProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return Qwen2VLProfilingInfo(self.ctx) +class 
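# Standalone sketch of the dummy-input construction used by the builders
# above: repeat a placeholder string once per item in the prompt text and pair
# it with worst-case-sized dummy media. The placeholder strings, the
# ProcessorInputs-style tuple, and the sizes are all illustrative; PIL and
# NumPy stand in for the real media handling.
from typing import Mapping, NamedTuple

import numpy as np
from PIL import Image


class ToyProcessorInputs(NamedTuple):
    prompt_text: str
    mm_data: Mapping[str, object]


def build_dummy_inputs(num_images: int, num_videos: int, width: int,
                       height: int, num_frames: int) -> ToyProcessorInputs:
    images = [Image.new("RGB", (width, height), color=0)] * num_images
    videos = [np.zeros((num_frames, width, height, 3))] * num_videos
    return ToyProcessorInputs(
        prompt_text="<image>" * num_images + "<video>" * num_videos,
        mm_data={"image": images, "video": videos},
    )


inputs = build_dummy_inputs(num_images=1, num_videos=1, width=1344,
                            height=1344, num_frames=16)
print(inputs.prompt_text)  # -> "<image><video>"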
Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] + ): def _get_data_parser(self) -> MultiModalDataParser: return Qwen2MultiModalDataParser() @@ -934,8 +944,9 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) - image_processor = self._get_image_processor(**hf_processor_mm_kwargs) + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self.info.get_image_processor( + **hf_processor_mm_kwargs) # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered @@ -991,7 +1002,9 @@ def _get_mm_fields_config( ) -@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor, + info=Qwen2VLProcessingInfo, + dummy_inputs=Qwen2VLDummyInputsBuilder) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): packed_modules_mapping = { diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index ecafd157b1d61..fada22d685dd6 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -24,11 +24,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import MultiModalDataParser +from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig @@ -59,9 +58,9 @@ class UltravoxAudioEmbeddingInputs(TypedDict): UltravoxAudioEmbeddingInputs] -class UltravoxProcessingMixin(ProcessingMixin): +class UltravoxProcessingInfo(BaseProcessingInfo): - def _get_hf_processor( + def get_hf_processor( self, *, # Ignored in initialization @@ -76,37 +75,38 @@ def _get_hf_processor( hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE return hf_processor - def _get_feature_extractor( + def get_feature_extractor( self, *, # Ignored in initialization sampling_rate: Optional[int] = None, ) -> WhisperFeatureExtractor: - hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) + hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) audio_processor = hf_processor.audio_processor # type: ignore feature_extractor = audio_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor - -class UltravoxProfilingInfo(UltravoxProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.get_feature_extractor() max_audio_tokens = math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) return {"audio": max_audio_tokens} + +class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo] + ): + def 
get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate @@ -123,14 +123,11 @@ def get_dummy_processor_inputs( ) -class UltravoxMultiModalProcessor(UltravoxProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return UltravoxProfilingInfo(self.ctx) +class UltravoxMultiModalProcessor( + BaseMultiModalProcessor[UltravoxProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( @@ -141,7 +138,7 @@ def _call_hf_processor( ) -> BatchFeature: # Text-only input not supported in composite processor if not mm_data: - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() prompt_ids = tokenizer.encode( prompt, @@ -160,7 +157,7 @@ def _call_hf_processor( mm_kwargs=mm_kwargs, ) - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, @@ -208,7 +205,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) placeholder = hf_processor.audio_token_replacement # type: ignore def get_replacement_ultravox(item_idx: int): @@ -342,7 +339,10 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, + info=UltravoxProcessingInfo, + dummy_inputs=UltravoxDummyInputsBuilder + ) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 41113cd85bd16..c6a30cacebdd1 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -4,12 +4,13 @@ from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass, field from functools import lru_cache -from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union +from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, + TypeVar, Union) from transformers import BatchFeature, PretrainedConfig, ProcessorMixin -from vllm import envs -from vllm.inputs import DummyData, InputProcessingContext +import vllm.envs as envs +from vllm.inputs import InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) @@ -20,7 +21,9 @@ MultiModalInputsV2, MultiModalKwargs, MultiModalKwargsItem, PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser -from .profiling import BaseProfilingInfo + +if TYPE_CHECKING: + from .profiling import BaseDummyInputsBuilder logger = init_logger(__name__) @@ -46,8 +49,8 @@ class PromptReplacement: if it does not depend on the input. 
""" - def bind(self, tokenizer: AnyTokenizer) -> "_BoundPromptReplacement": - return _BoundPromptReplacement( + def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement": + return BoundPromptReplacement( tokenizer=tokenizer, modality=self.modality, _target=self.target, @@ -128,7 +131,7 @@ def token_ids(self) -> list[int]: @dataclass -class _BoundPromptReplacement: +class BoundPromptReplacement: tokenizer: AnyTokenizer = field(repr=False) modality: str @@ -207,7 +210,7 @@ def iter_token_matches( @dataclass(repr=False) class _PromptReplacementMatch(ABC): - prompt_repl: _BoundPromptReplacement + prompt_repl: BoundPromptReplacement @property def modality(self) -> str: @@ -255,7 +258,7 @@ def end_idx(self) -> int: @dataclass -class _PlaceholderInfo: +class PlaceholderInfo: modality: str item_idx: int start_idx: int @@ -274,7 +277,7 @@ def to_range(self) -> PlaceholderRange: def find_token_matches( prompt: list[int], - prompt_repls: Sequence[_BoundPromptReplacement], + prompt_repls: Sequence[BoundPromptReplacement], ) -> list[_PromptReplacementTokenMatch]: """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" return [ @@ -286,7 +289,7 @@ def find_token_matches( def find_text_matches( prompt: str, - prompt_repls: Sequence[_BoundPromptReplacement], + prompt_repls: Sequence[BoundPromptReplacement], ) -> list[_PromptReplacementTextMatch]: """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" return [ @@ -390,9 +393,9 @@ def replace_text_matches( def _iter_modality_placeholders( prompt: list[int], modality: str, - modality_repls: Sequence[_BoundPromptReplacement], + modality_repls: Sequence[BoundPromptReplacement], modal_item_count: int, -) -> Iterable[_PlaceholderInfo]: +) -> Iterable[PlaceholderInfo]: if modal_item_count == 0: return @@ -413,7 +416,7 @@ def _iter_modality_placeholders( continue if prompt[start_idx:end_idx] == repl_tokens: - yield _PlaceholderInfo( + yield PlaceholderInfo( modality=modality, item_idx=item_idx, start_idx=start_idx, @@ -434,10 +437,10 @@ def _iter_modality_placeholders( def _iter_placeholders( - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Iterable[_PlaceholderInfo]: +) -> Iterable[PlaceholderInfo]: """ For each modality, yield each set of placeholder tokens found in :code:`prompt`. @@ -455,10 +458,10 @@ def _iter_placeholders( def find_mm_placeholders( - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Mapping[str, list[_PlaceholderInfo]]: +) -> Mapping[str, list[PlaceholderInfo]]: it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) return dict(full_groupby_modality(it)) @@ -524,29 +527,59 @@ def put( self._cache.put(cache_key, output_kwargs) -class ProcessingMixin: - """ - Contains helper functions to perform processing. +class BaseProcessingInfo: + """Base class containing information to perform processing.""" - Not to be confused with :class:`transformers.ProcessorMixin`. 
- """ - ctx: InputProcessingContext + def __init__(self, ctx: InputProcessingContext) -> None: + super().__init__() - def _get_tokenizer(self) -> AnyTokenizer: + self.ctx = ctx + + @property + def model_id(self) -> str: + return self.ctx.model_config.model + + def get_tokenizer(self) -> AnyTokenizer: return self.ctx.tokenizer - def _get_hf_config(self) -> PretrainedConfig: + def get_hf_config(self) -> PretrainedConfig: return self.ctx.get_hf_config() - def _get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: """ Subclasses can override this method to handle specific kwargs from model config or user inputs. """ return self.ctx.get_hf_processor(**kwargs) + @abstractmethod + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + """ + Return the maximum supported number of items for each modality. + + A value of `None` means unlimited number of items. + + Omitting a modality from the returned dictionary means that + it is not supported at all. + """ + raise NotImplementedError + + @abstractmethod + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + """ + Get the maximum possible number of tokens per data item + for each modality. + + The dictionary returned by this method should have the same + keys as that returned by :meth:`get_supported_mm_limits`. + """ + raise NotImplementedError + + +_I = TypeVar("_I", bound=BaseProcessingInfo) -class BaseMultiModalProcessor(ProcessingMixin, ABC): + +class BaseMultiModalProcessor(ABC, Generic[_I]): """ Abstract base class to process multi-modal inputs to be used in vLLM. @@ -554,18 +587,19 @@ class BaseMultiModalProcessor(ProcessingMixin, ABC): """ def __init__(self, - ctx: InputProcessingContext, + info: _I, + dummy_inputs: "BaseDummyInputsBuilder[_I]", *, cache: Optional[ProcessingCache] = None, enable_sanity_checks: bool = True) -> None: super().__init__() - self.ctx = ctx + self.info = info + self.dummy_inputs = dummy_inputs self.cache = cache self.enable_sanity_checks = enable_sanity_checks self.data_parser = self._get_data_parser() - self.profiling_info = self._get_profiling_info() def __call__( self, @@ -585,13 +619,6 @@ def _get_data_parser(self) -> MultiModalDataParser: """ return MultiModalDataParser() - def _get_profiling_info(self) -> BaseProfilingInfo: - """ - Get the profiling information to find the worst-case memory usage of - the model. 
- """ - raise NotImplementedError - def _to_mm_items( self, mm_data: MultiModalDataDict, @@ -602,7 +629,7 @@ def _to_mm_items( """ mm_items = self.data_parser.parse_mm_data(mm_data) - mm_limits = self.ctx.get_mm_config().limit_per_prompt + mm_limits = self.info.ctx.get_mm_config().limit_per_prompt for modality, items in mm_items.items(): limit = mm_limits.get(modality, 1) if len(items) > limit: @@ -646,19 +673,19 @@ def _get_prompt_replacements( def _find_mm_placeholders( self, - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], new_token_ids: list[int], mm_item_counts: Mapping[str, int], - ) -> Mapping[str, list[_PlaceholderInfo]]: + ) -> Mapping[str, list[PlaceholderInfo]]: return find_mm_placeholders(mm_prompt_repls, new_token_ids, mm_item_counts) def _get_hf_mm_data( self, mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - processor_data = dict[str, Any]() - passthrough_data = dict[str, Any]() + ) -> tuple[Mapping[str, object], Mapping[str, object]]: + processor_data = dict[str, object]() + passthrough_data = dict[str, object]() for items in mm_items.values(): processor_data.update(items.get_processor_data()) @@ -678,8 +705,8 @@ def _call_hf_processor( Call the HF processor on the prompt text and associated multi-modal data. """ - return self.ctx.call_hf_processor( - self._get_hf_processor(**mm_kwargs), + return self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), dict(text=prompt, **mm_data), mm_kwargs, ) @@ -738,8 +765,8 @@ def _apply_hf_processor_missing( # Some HF processors (e.g. Qwen2-VL) expect corresponding # multi-modal tokens to be in the prompt text - dummy_inputs = self.profiling_info.get_dummy_processor_inputs( - self.ctx.model_config.max_model_len, + dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs( + self.info.ctx.model_config.max_model_len, mm_missing_counts, ) @@ -762,7 +789,7 @@ def _cached_apply_hf_processor( caching the results and reusing cached results. 
""" cache = self.cache - model_id = self.ctx.model_config.model + model_id = self.info.model_id _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: @@ -838,8 +865,8 @@ def _cached_apply_hf_processor( def _bind_and_group_repls( self, prompt_repls: list[PromptReplacement], - ) -> dict[str, list[_BoundPromptReplacement]]: - tokenizer = self._get_tokenizer() + ) -> dict[str, list[BoundPromptReplacement]]: + tokenizer = self.info.get_tokenizer() it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls) return dict(full_groupby_modality(it)) @@ -859,10 +886,10 @@ def _always_apply_prompt_replacements(self) -> bool: def _apply_prompt_replacements( self, token_ids: list[int], - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: - tokenizer = self._get_tokenizer() + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]: + tokenizer = self.info.get_tokenizer() mm_token_matches = { modality: find_token_matches(token_ids, prompt_repls) @@ -950,7 +977,7 @@ def _validate_mm_kwargs( def _validate_mm_placeholders( self, - mm_placeholders: Mapping[str, list[_PlaceholderInfo]], + mm_placeholders: Mapping[str, list[PlaceholderInfo]], mm_item_counts: Mapping[str, int], *, allow_missing: bool = False, @@ -1001,7 +1028,7 @@ def apply( # instead of rehashing. if envs.VLLM_USE_V1: - model_id = self.ctx.model_config.model + model_id = self.info.model_id mm_hashes = { modality: [ MultiModalHasher.hash_kwargs(model_id=model_id, @@ -1046,7 +1073,7 @@ def apply( allow_missing=True, ) - mm_missing_repls = dict[str, list[_BoundPromptReplacement]]() + mm_missing_repls = dict[str, list[BoundPromptReplacement]]() for modality, missing_repl_count in mm_missing_repl_counts.items(): if missing_repl_count == 0: mm_missing_repls[modality] = [] @@ -1059,7 +1086,7 @@ def apply( # If HF processor already inserts placeholder tokens, # there is no need for us to insert them if all(len(repls) == 0 for repls in mm_missing_repls.items()): - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() prompt_text = decode_tokens(tokenizer, prompt_ids) mm_placeholders = hf_mm_placeholders else: @@ -1090,79 +1117,3 @@ def apply( mm_hashes=mm_hashes, mm_placeholders=mm_placeholder_ranges, ) - - def _get_dummy_mm_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalInputsV2: - profiling = self.profiling_info - processor_inputs = profiling.get_dummy_processor_inputs( - seq_len, mm_counts) - - return self.apply( - prompt_text=processor_inputs.prompt_text, - mm_data=processor_inputs.mm_data, - hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, - ) - - def get_dummy_data(self, seq_len: int) -> DummyData: - # Avoid circular import - from vllm.sequence import SequenceData - - profiling = self.profiling_info - mm_counts = profiling.get_mm_limits() - mm_max_tokens_per_item = profiling.get_mm_max_tokens_per_item(seq_len) - if mm_counts.keys() != mm_max_tokens_per_item.keys(): - raise AssertionError( - "The keys returned by `get_supported_mm_limits`" - f"({set(mm_counts.keys())}) should be the same as those " - "returned by `get_mm_max_tokens_per_item` " - f"({set(mm_max_tokens_per_item.keys())})") - - mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) - prompt_token_ids = mm_inputs["prompt_token_ids"] - placeholders_by_modality = 
mm_inputs["mm_placeholders"] - - total_placeholders_by_modality = { - modality: sum(item["length"] for item in placeholders) - for modality, placeholders in placeholders_by_modality.items() - } - expected_placeholders_by_modality = { - modality: mm_max_tokens_per_item[modality] * mm_counts[modality] - for modality in placeholders_by_modality - } - if total_placeholders_by_modality != expected_placeholders_by_modality: - raise AssertionError( - f"The processed dummy data has a total of " - f"{total_placeholders_by_modality} placeholder tokens, which " - f"is not the expected {expected_placeholders_by_modality} " - "tokens.") - - total_len = len(prompt_token_ids) - - # V0 does not support chunked prefill. - if total_len > seq_len and not envs.VLLM_USE_V1: - logger.warning( - "The context length (%d) of the model is too short " - "to hold the multi-modal embeddings in the worst case " - "(%d tokens in total, out of which %s are reserved for " - "multi-modal embeddings). This may cause certain multi-modal " - "inputs to fail during inference, even when the input text is " - "short. To avoid this, you should increase `max_model_len`, " - "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, - total_len, total_placeholders_by_modality) - - return DummyData( - seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), - multi_modal_data=None, - multi_modal_placeholders=None, - ) - - prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) - - return DummyData( - seq_data=SequenceData.from_seqs(prompt_token_ids), - multi_modal_data=mm_inputs["mm_kwargs"], - multi_modal_placeholders=placeholders_by_modality, - ) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 2ecf0db1a485d..2ac3a6bcf3ddd 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,16 +1,18 @@ from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field -from typing import Optional +from typing import Generic, TypeVar import numpy as np import numpy.typing as npt from PIL import Image -from vllm.inputs import InputProcessingContext +import vllm.envs as envs +from vllm.inputs import DummyData from vllm.logger import init_logger -from .inputs import MultiModalDataDict +from .inputs import MultiModalDataDict, MultiModalInputsV2 +from .processing import BaseMultiModalProcessor, BaseProcessingInfo logger = init_logger(__name__) @@ -23,39 +25,19 @@ class ProcessorInputs: hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) -class BaseProfilingInfo(ABC): +_I = TypeVar("_I", bound=BaseProcessingInfo) + + +class BaseDummyInputsBuilder(ABC, Generic[_I]): """ - Abstract base class that provides the information necessary to profile + Abstract base class that constructs the dummy data to profile multi-modal models. """ - def __init__(self, ctx: InputProcessingContext) -> None: + def __init__(self, info: _I) -> None: super().__init__() - self.ctx = ctx - - @abstractmethod - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - """ - Return the maximum supported number of items for each modality. - - A value of `None` means unlimited number of items. - - Omitting a modality from the returned dictionary means that - it is not supported at all. - """ - raise NotImplementedError - - @abstractmethod - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - """ - Get the maximum possible number of tokens per data item - for each modality. 
- - The dictionary returned by this method should have the same - keys as that returned by :meth:`get_supported_mm_limits`. - """ - raise NotImplementedError + self.info = info @abstractmethod def get_dummy_processor_inputs( @@ -64,8 +46,8 @@ def get_dummy_processor_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: """ - Build the multi-modal portion of the input which, after processing, - results in `mm_max_tokens` in :meth:`get_mm_max_tokens_per_item`. + Build the input which, after processing, results in + `self.info.get_mm_max_tokens_per_item()` placeholder tokens. """ raise NotImplementedError @@ -99,11 +81,33 @@ def _get_dummy_videos( video = np.zeros((num_frames, width, height, 3)) return [video] * num_videos - def get_mm_limits(self) -> Mapping[str, int]: - mm_config = self.ctx.get_mm_config() + +class MultiModalProfiler(Generic[_I]): + """ + Contains code for running memory profiling for multi-modal models. + """ + + def __init__( + self, + processor: BaseMultiModalProcessor[_I], + ) -> None: + super().__init__() + + self.processor = processor + + @property + def processing_info(self) -> BaseProcessingInfo: + return self.processor.info + + @property + def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]: + return self.processor.dummy_inputs + + def _get_mm_limits(self) -> Mapping[str, int]: + mm_config = self.processing_info.ctx.get_mm_config() mm_limit_per_prompt = mm_config.limit_per_prompt - supported_mm_limits = self.get_supported_mm_limits() + supported_mm_limits = self.processing_info.get_supported_mm_limits() mm_limits = { modality: mm_limit_per_prompt.get(modality, 1) @@ -119,3 +123,81 @@ def get_mm_limits(self) -> Mapping[str, int]: f"at most {supported_limit} {modality} items.") return mm_limits + + def _get_dummy_mm_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalInputsV2: + factory = self.dummy_inputs + processor_inputs = factory.get_dummy_processor_inputs( + seq_len, mm_counts) + + return self.processor.apply( + prompt_text=processor_inputs.prompt_text, + mm_data=processor_inputs.mm_data, + hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, + ) + + def get_dummy_data(self, seq_len: int) -> DummyData: + # Avoid circular import + from vllm.sequence import SequenceData + + mm_counts = self._get_mm_limits() + + info = self.processing_info + mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len) + + if mm_counts.keys() != mm_max_tokens_per_item.keys(): + raise AssertionError( + "The keys returned by `get_supported_mm_limits`" + f"({set(mm_counts.keys())}) should be the same as those " + "returned by `get_mm_max_tokens_per_item` " + f"({set(mm_max_tokens_per_item.keys())})") + + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) + prompt_token_ids = mm_inputs["prompt_token_ids"] + placeholders_by_modality = mm_inputs["mm_placeholders"] + + total_placeholders_by_modality = { + modality: sum(item["length"] for item in placeholders) + for modality, placeholders in placeholders_by_modality.items() + } + expected_placeholders_by_modality = { + modality: mm_max_tokens_per_item[modality] * mm_counts[modality] + for modality in placeholders_by_modality + } + if total_placeholders_by_modality != expected_placeholders_by_modality: + raise AssertionError( + f"The processed dummy data has a total of " + f"{total_placeholders_by_modality} placeholder tokens, which " + f"is not the expected {expected_placeholders_by_modality} " + "tokens.") + + total_len = len(prompt_token_ids) + + # V0 does not support chunked 
prefill. + if total_len > seq_len and not envs.VLLM_USE_V1: + logger.warning( + "The context length (%d) of the model is too short " + "to hold the multi-modal embeddings in the worst case " + "(%d tokens in total, out of which %s are reserved for " + "multi-modal embeddings). This may cause certain multi-modal " + "inputs to fail during inference, even when the input text is " + "short. To avoid this, you should increase `max_model_len`, " + "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, + total_len, total_placeholders_by_modality) + + return DummyData( + seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), + multi_modal_data=None, + multi_modal_placeholders=None, + ) + + prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) + + return DummyData( + seq_data=SequenceData.from_seqs(prompt_token_ids), + multi_modal_data=mm_inputs["mm_kwargs"], + multi_modal_placeholders=placeholders_by_modality, + ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index f75a594a4c4e0..5f01eac4edade 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,7 +1,8 @@ import functools from collections import UserDict -from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol, - Sequence, Type, TypeVar) +from dataclasses import dataclass +from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional, + Protocol, Sequence, Type, TypeVar) import torch.nn as nn @@ -14,7 +15,9 @@ from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc from .image import ImagePlugin from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors -from .processing import BaseMultiModalProcessor, ProcessingCache +from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, + ProcessingCache) +from .profiling import BaseDummyInputsBuilder from .utils import cached_get_tokenizer from .video import VideoPlugin @@ -27,20 +30,59 @@ MM_CACHE_SIZE = 256 N = TypeVar("N", bound=Type[nn.Module]) +_I = TypeVar("_I", bound=BaseProcessingInfo) +_I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True) -class MultiModalProcessorFactory(Protocol): +class ProcessingInfoFactory(Protocol[_I_co]): """Constructs a :class:`MultiModalProcessor` instance from the context.""" def __call__( self, ctx: InputProcessingContext, + ) -> _I_co: + ... + + +class DummyInputsBuilderFactory(Protocol[_I]): + """ + Constructs a :class:`BaseDummyInputsBuilder` instance from the context. + """ + + def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: + ... + + +class MultiModalProcessorFactory(Protocol[_I]): + """Constructs a :class:`MultiModalProcessor` instance from the context.""" + + def __call__( + self, + info: _I, + dummy_inputs: BaseDummyInputsBuilder[_I], *, cache: Optional[ProcessingCache] = None, - ) -> BaseMultiModalProcessor: + ) -> BaseMultiModalProcessor[_I]: ... 
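For orientation, here is a minimal sketch of how a model file would hook into the three-factory `register_processor` API introduced in this hunk. It is not part of this patch: the `My*` classes are hypothetical stubs standing in for real subclasses, and the hint comments only gesture at the abstract methods those subclasses would implement.

```python
# Sketch only: "My*" names are hypothetical placeholders, not vLLM code.
import torch.nn as nn

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo)
from vllm.multimodal.profiling import BaseDummyInputsBuilder


class MyProcessingInfo(BaseProcessingInfo):
    ...  # would implement get_supported_mm_limits(), get_mm_max_tokens_per_item()


class MyDummyInputsBuilder(BaseDummyInputsBuilder[MyProcessingInfo]):
    ...  # would implement get_dummy_processor_inputs()


class MyMultiModalProcessor(BaseMultiModalProcessor[MyProcessingInfo]):
    ...  # would implement the processor's abstract hooks


# One factory per component, composed by _ProcessorFactories.build_processor:
@MULTIMODAL_REGISTRY.register_processor(
    MyMultiModalProcessor,
    info=MyProcessingInfo,
    dummy_inputs=MyDummyInputsBuilder,
)
class MyModelForConditionalGeneration(nn.Module):
    ...
```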
+@dataclass(frozen=True) +class _ProcessorFactories(Generic[_I]): + info: ProcessingInfoFactory[_I] + processor: MultiModalProcessorFactory[_I] + dummy_inputs: DummyInputsBuilderFactory[_I] + + def build_processor( + self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + ): + info = self.info(ctx) + dummy_inputs_builder = self.dummy_inputs(info) + return self.processor(info, dummy_inputs_builder, cache=cache) + + class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): """ Wraps `_limits_by_model` for a more informative error message @@ -71,7 +113,7 @@ def __init__( self._plugins = {p.get_data_key(): p for p in plugins} self._processor_factories = ClassRegistry[nn.Module, - MultiModalProcessorFactory]() + _ProcessorFactories]() # This is used for non-multimodal models self._disabled_limits_per_plugin = {k: 0 for k in self._plugins} @@ -224,7 +266,7 @@ def get_max_tokens_per_item_by_modality( tokenizer = cached_get_tokenizer(model_config.tokenizer) processor = self.create_processor(model_config, tokenizer) seq_len = model_config.max_model_len - return processor.profiling_info.get_mm_max_tokens_per_item(seq_len) + return processor.info.get_mm_max_tokens_per_item(seq_len) return { key: plugin.get_max_multimodal_tokens(model_config) @@ -315,7 +357,10 @@ def get_mm_limits_per_prompt( def register_processor( self, - factory: MultiModalProcessorFactory, + processor: MultiModalProcessorFactory[_I], + *, + info: ProcessingInfoFactory[_I], + dummy_inputs: DummyInputsBuilderFactory[_I], ): """ Register a multi-modal processor to a model class. The processor @@ -336,7 +381,11 @@ def wrapper(model_cls: N) -> N: "registered to %s. It is overwritten by the new one.", model_cls, self) - self._processor_factories[model_cls] = factory + self._processor_factories[model_cls] = _ProcessorFactories( + info=info, + dummy_inputs=dummy_inputs, + processor=processor, + ) return model_cls @@ -359,15 +408,15 @@ def create_processor( self, model_config: "ModelConfig", tokenizer: AnyTokenizer, - ) -> BaseMultiModalProcessor: + ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. 
""" model_cls = self._get_model_cls(model_config) - processor_factory = self._processor_factories[model_cls] + factories = self._processor_factories[model_cls] ctx = InputProcessingContext(model_config, tokenizer) cache = (None if model_config.disable_mm_preprocessor_cache else self._processing_cache) - return processor_factory(ctx, cache=cache) + return factories.build_processor(ctx, cache=cache) From aba8d6ee006b78149ac4514f460e4038b2d4f607 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 8 Jan 2025 13:09:53 +0000 Subject: [PATCH 16/55] [Doc] Move examples into categories (#11840) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/run-cpu-test.sh | 2 +- .buildkite/run-gh200-test.sh | 2 +- .buildkite/run-hpu-test.sh | 2 +- .buildkite/run-neuron-test.sh | 2 +- .buildkite/run-openvino-test.sh | 2 +- .buildkite/run-tpu-test.sh | 2 +- .buildkite/run-xpu-test.sh | 4 +- .buildkite/test-pipeline.yaml | 26 +++++------ .github/workflows/lint-and-deploy.yaml | 4 +- Dockerfile | 2 +- .../contributing/profiling/profiling_index.md | 2 +- docs/source/deployment/frameworks/skypilot.md | 4 +- docs/source/features/disagg_prefill.md | 2 +- docs/source/features/lora.md | 2 +- docs/source/features/quantization/auto_awq.md | 2 +- .../features/quantization/fp8_e4m3_kvcache.md | 2 +- docs/source/features/structured_outputs.md | 4 +- docs/source/generate_examples.py | 45 ++++++++++--------- .../getting_started/installation/cpu-x86.md | 4 +- .../getting_started/installation/xpu.md | 2 +- docs/source/getting_started/quickstart.md | 4 +- .../source/getting_started/troubleshooting.md | 2 +- docs/source/models/extensions/tensorizer.md | 2 +- docs/source/models/generative_models.md | 4 +- docs/source/models/pooling_models.md | 6 +-- docs/source/serving/distributed_serving.md | 2 +- docs/source/serving/multimodal_inputs.md | 16 +++---- .../serving/openai_compatible_server.md | 10 ++--- .../{ => offline_inference}/aqlm_example.py | 0 .../{ => offline_inference}/cpu_offload.py | 0 .../florence2_inference.py | 3 +- .../{ => offline_inference}/gguf_inference.py | 0 .../llm_engine_example.py | 0 .../lora_with_quantization_inference.py | 0 .../multilora_inference.py | 0 .../offline_chat_with_tools.py | 0 .../offline_inference.py | 0 .../offline_inference_arctic.py | 0 .../offline_inference_audio_language.py | 0 .../offline_inference_chat.py | 0 .../offline_inference_classification.py | 0 .../offline_inference_cli.py | 0 .../offline_inference_distributed.py | 0 .../offline_inference_embedding.py | 0 .../offline_inference_encoder_decoder.py | 0 .../offline_inference_mlpspeculator.py | 0 .../offline_inference_neuron.py | 0 ...line_inference_neuron_int8_quantization.py | 0 .../offline_inference_openai.md | 18 ++++---- .../openai_example_batch.jsonl | 0 .../offline_inference_pixtral.py | 0 .../offline_inference_scoring.py | 0 .../offline_inference_structured_outputs.py | 0 .../offline_inference_tpu.py | 0 .../offline_inference_vision_language.py | 0 ...ine_inference_vision_language_embedding.py | 0 ...e_inference_vision_language_multi_image.py | 0 .../offline_inference_whisper.py | 0 ...nference_with_default_generation_config.py | 0 .../offline_inference_with_prefix.py | 0 .../offline_inference_with_profiler.py | 0 .../offline_profile.py | 2 +- .../save_sharded_state.py | 0 examples/{ => online_serving}/api_client.py | 0 .../chart-helm/.helmignore | 0 .../chart-helm/Chart.yaml | 0 examples/online_serving/chart-helm/README.md | 21 +++++++++ .../{ 
=> online_serving}/chart-helm/ct.yaml | 0 .../chart-helm/lintconf.yaml | 0 .../chart-helm/templates/_helpers.tpl | 0 .../chart-helm/templates/configmap.yaml | 0 .../chart-helm/templates/custom-objects.yaml | 0 .../chart-helm/templates/deployment.yaml | 0 .../chart-helm/templates/hpa.yaml | 0 .../chart-helm/templates/job.yaml | 0 .../templates/poddisruptionbudget.yaml | 0 .../chart-helm/templates/pvc.yaml | 0 .../chart-helm/templates/secrets.yaml | 0 .../chart-helm/templates/service.yaml | 0 .../chart-helm/values.schema.json | 0 .../chart-helm/values.yaml | 0 .../disaggregated_prefill.sh | 0 .../gradio_openai_chatbot_webserver.py | 0 .../{ => online_serving}/gradio_webserver.py | 0 .../openai_chat_completion_client.py | 0 ...i_chat_completion_client_for_multimodal.py | 0 ...penai_chat_completion_client_with_tools.py | 0 ...enai_chat_completion_structured_outputs.py | 0 ...ai_chat_embedding_client_for_multimodal.py | 0 .../openai_completion_client.py | 0 .../openai_cross_encoder_score.py | 0 .../openai_embedding_client.py | 0 .../openai_pooling_client.py | 0 .../opentelemetry/Otel.md | 0 .../opentelemetry/dummy_client.py | 0 .../prometheus_grafana/README.md | 0 .../prometheus_grafana/docker-compose.yaml | 0 .../prometheus_grafana/grafana.json | 0 .../prometheus_grafana/prometheus.yaml | 0 examples/{ => online_serving}/run_cluster.sh | 0 .../sagemaker-entrypoint.sh | 0 examples/{ => other}/fp8/README.md | 10 ++--- examples/{ => other}/fp8/extract_scales.py | 0 examples/{ => other}/fp8/quantizer/README.md | 0 .../{ => other}/fp8/quantizer/quantize.py | 0 examples/{ => other}/logging_configuration.md | 0 examples/{ => other}/tensorize_vllm_model.py | 10 ++--- pyproject.toml | 2 +- tests/plugins_tests/test_platform_plugins.py | 2 +- tests/tensorizer_loader/test_tensorizer.py | 4 +- tools/profiler/print_layerwise_table.py | 2 +- tools/profiler/visualize_layerwise_profile.py | 10 ++--- vllm/distributed/kv_transfer/README.md | 2 +- vllm/model_executor/model_loader/loader.py | 11 ++--- .../model_executor/model_loader/tensorizer.py | 14 +++--- .../model_loader/weight_utils.py | 3 +- 116 files changed, 153 insertions(+), 124 deletions(-) rename examples/{ => offline_inference}/aqlm_example.py (100%) rename examples/{ => offline_inference}/cpu_offload.py (100%) rename examples/{ => offline_inference}/florence2_inference.py (92%) rename examples/{ => offline_inference}/gguf_inference.py (100%) rename examples/{ => offline_inference}/llm_engine_example.py (100%) rename examples/{ => offline_inference}/lora_with_quantization_inference.py (100%) rename examples/{ => offline_inference}/multilora_inference.py (100%) rename examples/{ => offline_inference}/offline_chat_with_tools.py (100%) rename examples/{ => offline_inference}/offline_inference.py (100%) rename examples/{ => offline_inference}/offline_inference_arctic.py (100%) rename examples/{ => offline_inference}/offline_inference_audio_language.py (100%) rename examples/{ => offline_inference}/offline_inference_chat.py (100%) rename examples/{ => offline_inference}/offline_inference_classification.py (100%) rename examples/{ => offline_inference}/offline_inference_cli.py (100%) rename examples/{ => offline_inference}/offline_inference_distributed.py (100%) rename examples/{ => offline_inference}/offline_inference_embedding.py (100%) rename examples/{ => offline_inference}/offline_inference_encoder_decoder.py (100%) rename examples/{ => offline_inference}/offline_inference_mlpspeculator.py (100%) rename examples/{ => 
offline_inference}/offline_inference_neuron.py (100%) rename examples/{ => offline_inference}/offline_inference_neuron_int8_quantization.py (100%) rename examples/{ => offline_inference/offline_inference_openai}/offline_inference_openai.md (90%) rename examples/{ => offline_inference/offline_inference_openai}/openai_example_batch.jsonl (100%) rename examples/{ => offline_inference}/offline_inference_pixtral.py (100%) rename examples/{ => offline_inference}/offline_inference_scoring.py (100%) rename examples/{ => offline_inference}/offline_inference_structured_outputs.py (100%) rename examples/{ => offline_inference}/offline_inference_tpu.py (100%) rename examples/{ => offline_inference}/offline_inference_vision_language.py (100%) rename examples/{ => offline_inference}/offline_inference_vision_language_embedding.py (100%) rename examples/{ => offline_inference}/offline_inference_vision_language_multi_image.py (100%) rename examples/{ => offline_inference}/offline_inference_whisper.py (100%) rename examples/{ => offline_inference}/offline_inference_with_default_generation_config.py (100%) rename examples/{ => offline_inference}/offline_inference_with_prefix.py (100%) rename examples/{ => offline_inference}/offline_inference_with_profiler.py (100%) rename examples/{ => offline_inference}/offline_profile.py (99%) rename examples/{ => offline_inference}/save_sharded_state.py (100%) rename examples/{ => online_serving}/api_client.py (100%) rename examples/{ => online_serving}/chart-helm/.helmignore (100%) rename examples/{ => online_serving}/chart-helm/Chart.yaml (100%) create mode 100644 examples/online_serving/chart-helm/README.md rename examples/{ => online_serving}/chart-helm/ct.yaml (100%) rename examples/{ => online_serving}/chart-helm/lintconf.yaml (100%) rename examples/{ => online_serving}/chart-helm/templates/_helpers.tpl (100%) rename examples/{ => online_serving}/chart-helm/templates/configmap.yaml (100%) rename examples/{ => online_serving}/chart-helm/templates/custom-objects.yaml (100%) rename examples/{ => online_serving}/chart-helm/templates/deployment.yaml (100%) rename examples/{ => online_serving}/chart-helm/templates/hpa.yaml (100%) rename examples/{ => online_serving}/chart-helm/templates/job.yaml (100%) rename examples/{ => online_serving}/chart-helm/templates/poddisruptionbudget.yaml (100%) rename examples/{ => online_serving}/chart-helm/templates/pvc.yaml (100%) rename examples/{ => online_serving}/chart-helm/templates/secrets.yaml (100%) rename examples/{ => online_serving}/chart-helm/templates/service.yaml (100%) rename examples/{ => online_serving}/chart-helm/values.schema.json (100%) rename examples/{ => online_serving}/chart-helm/values.yaml (100%) rename examples/{ => online_serving}/disaggregated_prefill.sh (100%) rename examples/{ => online_serving}/gradio_openai_chatbot_webserver.py (100%) rename examples/{ => online_serving}/gradio_webserver.py (100%) rename examples/{ => online_serving}/openai_chat_completion_client.py (100%) rename examples/{ => online_serving}/openai_chat_completion_client_for_multimodal.py (100%) rename examples/{ => online_serving}/openai_chat_completion_client_with_tools.py (100%) rename examples/{ => online_serving}/openai_chat_completion_structured_outputs.py (100%) rename examples/{ => online_serving}/openai_chat_embedding_client_for_multimodal.py (100%) rename examples/{ => online_serving}/openai_completion_client.py (100%) rename examples/{ => online_serving}/openai_cross_encoder_score.py (100%) rename examples/{ => 
online_serving}/openai_embedding_client.py (100%) rename examples/{ => online_serving}/openai_pooling_client.py (100%) rename examples/{ => online_serving}/opentelemetry/Otel.md (100%) rename examples/{ => online_serving}/opentelemetry/dummy_client.py (100%) rename examples/{ => online_serving}/prometheus_grafana/README.md (100%) rename examples/{ => online_serving}/prometheus_grafana/docker-compose.yaml (100%) rename examples/{ => online_serving}/prometheus_grafana/grafana.json (100%) rename examples/{ => online_serving}/prometheus_grafana/prometheus.yaml (100%) rename examples/{ => online_serving}/run_cluster.sh (100%) rename examples/{ => online_serving}/sagemaker-entrypoint.sh (100%) rename examples/{ => other}/fp8/README.md (88%) rename examples/{ => other}/fp8/extract_scales.py (100%) rename examples/{ => other}/fp8/quantizer/README.md (100%) rename examples/{ => other}/fp8/quantizer/quantize.py (100%) rename examples/{ => other}/logging_configuration.md (100%) rename examples/{ => other}/tensorize_vllm_model.py (96%) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index a4eca078568fd..87d08c8c7fdcb 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -30,7 +30,7 @@ function cpu_tests() { # offline inference docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference.py" + python3 examples/offline_inference/offline_inference.py" # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 4fc6d089cc666..1e5ff77895a38 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -24,5 +24,5 @@ remove_docker_container # Run the image and test offline inference docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference.py + python3 examples/offline_inference/offline_inference.py ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index fa4f74fca7a11..a50570ab53438 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py \ No newline at end of file diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index aa29c434e7cfb..52d485939b1d0 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 6b12f424fd828..380f7a44a429a 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT 
remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 770dad6ffa3a1..13605a3e97142 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -14,4 +14,4 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. -docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index e0a12afbe7320..160e10aa3bb9b 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -14,6 +14,6 @@ remove_docker_container # Run the image and test offline inference/tensor parallel docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference.py - python3 examples/offline_inference_cli.py -tp 2 + python3 examples/offline_inference/offline_inference.py + python3 examples/offline_inference/offline_inference_cli.py -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index dcfe228ce8eae..b7178b94f481a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -187,19 +187,19 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference.py - - python3 cpu_offload.py - - python3 offline_inference_chat.py - - python3 offline_inference_with_prefix.py - - python3 llm_engine_example.py - - python3 offline_inference_vision_language.py - - python3 offline_inference_vision_language_multi_image.py - - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference_encoder_decoder.py - - python3 offline_inference_classification.py - - python3 offline_inference_embedding.py - - python3 offline_inference_scoring.py - - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 offline_inference/offline_inference.py + - python3 offline_inference/cpu_offload.py + - python3 
offline_inference/offline_inference_chat.py + - python3 offline_inference/offline_inference_with_prefix.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/offline_inference_vision_language.py + - python3 offline_inference/offline_inference_vision_language_multi_image.py + - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/offline_inference_encoder_decoder.py + - python3 offline_inference/offline_inference_classification.py + - python3 offline_inference/offline_inference_embedding.py + - python3 offline_inference/offline_inference_scoring.py + - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index ab6f6e5d2060d..ee768db63c96c 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -27,7 +27,7 @@ jobs: version: v3.10.1 - name: Run chart-testing (lint) - run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm + run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm - name: Setup minio run: | @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | diff --git a/Dockerfile b/Dockerfile index 088314eb38dbe..808cf675acf4d 100644 --- a/Dockerfile +++ b/Dockerfile 
@@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image # define sagemaker first, so it is not default from `docker build` FROM vllm-openai-base AS vllm-sagemaker -COPY examples/sagemaker-entrypoint.sh . +COPY examples/online_serving/sagemaker-entrypoint.sh . RUN chmod +x sagemaker-entrypoint.sh ENTRYPOINT ["./sagemaker-entrypoint.sh"] diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 46210957c19ec..97de40ff469f1 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve ### Offline Inference -Refer to for an example. +Refer to for an example. ### OpenAI Server diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index f02a943026922..657e7f2bc72cc 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -61,7 +61,7 @@ run: | echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://localhost:8081/v1 \ @@ -321,7 +321,7 @@ run: | echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://$ENDPOINT/v1 \ diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md index 645dc60807dd3..efa2efc66192e 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput. ## Usage example -Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. +Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. ## Benchmarks diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index cf06916d70f44..b00d05147bb32 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -47,7 +47,7 @@ outputs = llm.generate( ) ``` -Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. +Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. 
## Serving LoRA Adapters diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index c02fbf0605a8c..3679595e3d4d0 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md index f200c722d1d42..50edaf81fddd3 100644 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md @@ -28,7 +28,7 @@ Here is an example of how to enable this feature: ```python # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to -# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. +# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own. from vllm import LLM, SamplingParams sampling_params = SamplingParams(temperature=1.3, top_p=0.8) diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index 26c09bb0d8a0c..ccd9a6a1b1a14 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -131,7 +131,7 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -Full example: +Full example: ## Experimental Automatic Parsing (OpenAI API) @@ -257,4 +257,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +Full example: diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 32bb86c469c78..aaa13d0fb6d3f 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -12,6 +12,7 @@ def fix_case(text: str) -> str: subs = { "api": "API", + "Cli": "CLI", "cpu": "CPU", "llm": "LLM", "tpu": "TPU", @@ -58,7 +59,7 @@ def generate(self) -> str: content = f"# {self.title}\n\n{self.description}\n\n" content += "```{toctree}\n" content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" - content += "\n".join(sorted(self.documents)) + "\n```\n" + content += "\n".join(self.documents) + "\n```\n" return content @@ -131,11 +132,14 @@ def generate(self) -> str: ROOT_DIR) content = f"Source .\n\n" - if self.main_file.suffix == ".py": - content += f"# {self.title}\n\n" include = "include" if self.main_file.suffix == ".md" else \ "literalinclude" - content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n" + if include == "literalinclude": + content += f"# {self.title}\n\n" + content += f":::{{{include}}} {make_relative(self.main_file)}\n" + if include == "literalinclude": + content += f":language: {self.main_file.suffix[1:]}\n" + content += ":::\n\n" if not self.other_files: return content @@ -163,14 +167,16 @@ def generate_examples(): description= "A collection of examples demonstrating usage of vLLM.\nAll documented examples are 
autogenerated using from examples found in .", # noqa: E501 caption="Examples", - maxdepth=1) # TODO change to 2 when examples start being categorised + maxdepth=2) + # Category indices stored in reverse order because they are inserted into + # examples_index.documents at index 0 in order category_indices = { - "offline_inference": + "other": Index( - path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", - title="Offline Inference", + path=EXAMPLE_DOC_DIR / "examples_other_index.md", + title="Other", description= - "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 + "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 caption="Examples", ), "online_serving": @@ -181,31 +187,30 @@ def generate_examples(): "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501 caption="Examples", ), - "other": + "offline_inference": Index( - path=EXAMPLE_DOC_DIR / "examples_other_index.md", - title="Other", + path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", + title="Offline Inference", description= - "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 + "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 caption="Examples", ), } examples = [] + glob_patterns = ["*.py", "*.md", "*.sh"] # Find categorised examples for category in category_indices: category_dir = EXAMPLE_DIR / category - py = category_dir.glob("*.py") - md = category_dir.glob("*.md") - for path in itertools.chain(py, md): + globs = [category_dir.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): examples.append(Example(path, category)) # Find examples in subdirectories for path in category_dir.glob("*/*.md"): examples.append(Example(path.parent, category)) # Find uncategorised examples - py = EXAMPLE_DIR.glob("*.py") - md = EXAMPLE_DIR.glob("*.md") - for path in itertools.chain(py, md): + globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): examples.append(Example(path)) # Find examples in subdirectories for path in EXAMPLE_DIR.glob("*/*.md"): @@ -215,7 +220,7 @@ def generate_examples(): examples.append(Example(path.parent)) # Generate the example documentation - for example in examples: + for example in sorted(examples, key=lambda e: e.path.stem): doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md" with open(doc_path, "w+") as f: f.write(example.generate()) diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md index bbb2d1872ef39..bb046dd0fd9dc 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library $ find / -name *libtcmalloc* # find the dynamic link library path $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference.py # run vLLM +$ python examples/offline_inference/offline_inference.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving 
framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: @@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference.py +$ python examples/offline_inference/offline_inference.py ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md index be4e3b9bd1bc5..c1ab5478eb652 100644 --- a/docs/source/getting_started/installation/xpu.md +++ b/docs/source/getting_started/installation/xpu.md @@ -71,4 +71,4 @@ $ --pipeline-parallel-size=2 \ $ -tp=8 ``` -By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. +By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 3f9556165ece4..6b56918ce5638 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -31,7 +31,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: @@ -133,7 +133,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", print("Completion result:", completion) ``` -A more detailed client example can be found here: +A more detailed client example can be found here: ### OpenAI Chat Completions API with vLLM diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 5a0310da0f2cb..f5efe0bef7506 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form ## Model is too large -If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. 
+If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. ## Enable more logging diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md index 42ed5c795dd27..ae17e3437bca6 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html). +the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). ```{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 383299d61b5dd..6228c7c2ac957 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -46,7 +46,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 12ded68eb30b5..3e4407cfdc233 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -65,7 +65,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -80,7 +80,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -102,7 +102,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: ## Online Inference diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index b1703249d7224..4e0a9ef6ecf7d 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -51,7 +51,7 @@ $ --pipeline-parallel-size 2 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. 
It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. -The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. +The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. Pick a node as the head node, and run the following command: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 0efa09f2869ca..9f5e1b908d786 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -125,13 +125,13 @@ for o in outputs: You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. -Full example: +Full example: ### Audio You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary. -Full example: +Full example: ### Embedding @@ -271,7 +271,7 @@ chat_response = client.chat.completions.create( print("Chat completion output:", chat_response.choices[0].message.content) ``` -Full example: +Full example: ```{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, @@ -342,7 +342,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from image url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching videos through HTTP URL is `30` seconds. @@ -445,7 +445,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from audio url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. @@ -529,4 +529,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th example below for details. 
``` -Full example: +Full example: diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 1e5ea6357d202..022dd3ae8a237 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -191,7 +191,7 @@ The order of priorities is `command line > config file values > defaults`. Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -Code example: +Code example: #### Extra parameters @@ -222,7 +222,7 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and see our [Multimodal Inputs](#multimodal-inputs) guide for more information. - *Note: `image_url.detail` parameter is not supported.* -Code example: +Code example: #### Extra parameters @@ -255,7 +255,7 @@ which will be treated as a single prompt to the model. This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. ``` -Code example: +Code example: #### Extra parameters @@ -299,7 +299,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_ The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. -Code example: +Code example: (score-api)= ### Score API @@ -309,7 +309,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). -Code example: +Code example: #### Single inference diff --git a/examples/aqlm_example.py b/examples/offline_inference/aqlm_example.py similarity index 100% rename from examples/aqlm_example.py rename to examples/offline_inference/aqlm_example.py diff --git a/examples/cpu_offload.py b/examples/offline_inference/cpu_offload.py similarity index 100% rename from examples/cpu_offload.py rename to examples/offline_inference/cpu_offload.py diff --git a/examples/florence2_inference.py b/examples/offline_inference/florence2_inference.py similarity index 92% rename from examples/florence2_inference.py rename to examples/offline_inference/florence2_inference.py index b58ac2e1f7ed4..49dd2c331db5a 100644 --- a/examples/florence2_inference.py +++ b/examples/offline_inference/florence2_inference.py @@ -3,7 +3,8 @@ encoder/decoder models, specifically Florence-2 ''' # TODO(Isotr0py): -# Move to offline_inference_vision_language.py after porting vision backbone +# Move to offline_inference/offline_inference_vision_language.py +# after porting vision backbone from vllm import LLM, SamplingParams dtype = "float" diff --git a/examples/gguf_inference.py b/examples/offline_inference/gguf_inference.py similarity index 100% rename from examples/gguf_inference.py rename to examples/offline_inference/gguf_inference.py diff --git a/examples/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py similarity index 100% rename from examples/llm_engine_example.py rename to examples/offline_inference/llm_engine_example.py diff --git a/examples/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py similarity index 100% rename from examples/lora_with_quantization_inference.py rename to 
examples/offline_inference/lora_with_quantization_inference.py diff --git a/examples/multilora_inference.py b/examples/offline_inference/multilora_inference.py similarity index 100% rename from examples/multilora_inference.py rename to examples/offline_inference/multilora_inference.py diff --git a/examples/offline_chat_with_tools.py b/examples/offline_inference/offline_chat_with_tools.py similarity index 100% rename from examples/offline_chat_with_tools.py rename to examples/offline_inference/offline_chat_with_tools.py diff --git a/examples/offline_inference.py b/examples/offline_inference/offline_inference.py similarity index 100% rename from examples/offline_inference.py rename to examples/offline_inference/offline_inference.py diff --git a/examples/offline_inference_arctic.py b/examples/offline_inference/offline_inference_arctic.py similarity index 100% rename from examples/offline_inference_arctic.py rename to examples/offline_inference/offline_inference_arctic.py diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference/offline_inference_audio_language.py similarity index 100% rename from examples/offline_inference_audio_language.py rename to examples/offline_inference/offline_inference_audio_language.py diff --git a/examples/offline_inference_chat.py b/examples/offline_inference/offline_inference_chat.py similarity index 100% rename from examples/offline_inference_chat.py rename to examples/offline_inference/offline_inference_chat.py diff --git a/examples/offline_inference_classification.py b/examples/offline_inference/offline_inference_classification.py similarity index 100% rename from examples/offline_inference_classification.py rename to examples/offline_inference/offline_inference_classification.py diff --git a/examples/offline_inference_cli.py b/examples/offline_inference/offline_inference_cli.py similarity index 100% rename from examples/offline_inference_cli.py rename to examples/offline_inference/offline_inference_cli.py diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference/offline_inference_distributed.py similarity index 100% rename from examples/offline_inference_distributed.py rename to examples/offline_inference/offline_inference_distributed.py diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference/offline_inference_embedding.py similarity index 100% rename from examples/offline_inference_embedding.py rename to examples/offline_inference/offline_inference_embedding.py diff --git a/examples/offline_inference_encoder_decoder.py b/examples/offline_inference/offline_inference_encoder_decoder.py similarity index 100% rename from examples/offline_inference_encoder_decoder.py rename to examples/offline_inference/offline_inference_encoder_decoder.py diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference/offline_inference_mlpspeculator.py similarity index 100% rename from examples/offline_inference_mlpspeculator.py rename to examples/offline_inference/offline_inference_mlpspeculator.py diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference/offline_inference_neuron.py similarity index 100% rename from examples/offline_inference_neuron.py rename to examples/offline_inference/offline_inference_neuron.py diff --git a/examples/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/offline_inference_neuron_int8_quantization.py similarity index 100% rename from examples/offline_inference_neuron_int8_quantization.py rename to 
examples/offline_inference/offline_inference_neuron_int8_quantization.py diff --git a/examples/offline_inference_openai.md b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md similarity index 90% rename from examples/offline_inference_openai.md rename to examples/offline_inference/offline_inference_openai/offline_inference_openai.md index 2436417cb543a..6278a1943fe4a 100644 --- a/examples/offline_inference_openai.md +++ b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line. You can run the batch with the following command, which will write its results to a file called `results.jsonl` ``` -python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -66,10 +66,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. 
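Whether the input is local or remote, every line of the batch file has to follow the same request shape shown earlier. If you prefer to generate the batch file programmatically instead of downloading the example, a minimal sketch using only the standard library might look like this (the prompts here are placeholders):

```python
# Build a small batch file in the OpenAI batch format described above.
# Each line is one JSON object with custom_id, method, url and body fields.
import json

prompts = ["Hello world!", "What is the capital of France?"]

with open("openai_example_batch.jsonl", "w") as f:
    for i, prompt in enumerate(prompts, start=1):
        request = {
            "custom_id": f"request-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "meta-llama/Meta-Llama-3-8B-Instruct",
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                "max_completion_tokens": 1000,
            },
        }
        f.write(json.dumps(request) + "\n")
```

The resulting file can be passed to `vllm.entrypoints.openai.run_batch` exactly as in the command shown earlier.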
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run ``` -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -104,7 +104,7 @@ $ cat openai_example_batch.jsonl Now upload your batch file to your S3 bucket. 
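If you would rather perform the upload from Python than from the AWS CLI shown below, a short sketch using `boto3` (assuming your AWS credentials are already configured; the bucket and key names are placeholders) could look like this:

```python
# Upload the batch input file to S3; this mirrors the `aws s3 cp` command below.
import boto3

s3 = boto3.client("s3")
s3.upload_file(
    Filename="offline_inference/offline_inference_openai/openai_example_batch.jsonl",
    Bucket="MY_BUCKET",
    Key="MY_INPUT_FILE.jsonl",
)

# A presigned GET url for the input can then be created with
# `generate_presigned_url`, following the presigned-url recommendation above.
input_url = s3.generate_presigned_url(
    "get_object",
    Params={"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
    ExpiresIn=3600,
)
print(input_url)
```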
``` -aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/openai_example_batch.jsonl b/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl similarity index 100% rename from examples/openai_example_batch.jsonl rename to examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl diff --git a/examples/offline_inference_pixtral.py b/examples/offline_inference/offline_inference_pixtral.py similarity index 100% rename from examples/offline_inference_pixtral.py rename to examples/offline_inference/offline_inference_pixtral.py diff --git a/examples/offline_inference_scoring.py b/examples/offline_inference/offline_inference_scoring.py similarity index 100% rename from examples/offline_inference_scoring.py rename to examples/offline_inference/offline_inference_scoring.py diff --git a/examples/offline_inference_structured_outputs.py b/examples/offline_inference/offline_inference_structured_outputs.py similarity index 100% rename from examples/offline_inference_structured_outputs.py rename to examples/offline_inference/offline_inference_structured_outputs.py diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference/offline_inference_tpu.py similarity index 100% rename from examples/offline_inference_tpu.py rename to examples/offline_inference/offline_inference_tpu.py diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference/offline_inference_vision_language.py similarity index 100% rename from examples/offline_inference_vision_language.py rename to examples/offline_inference/offline_inference_vision_language.py diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference/offline_inference_vision_language_embedding.py similarity index 100% rename from examples/offline_inference_vision_language_embedding.py rename to examples/offline_inference/offline_inference_vision_language_embedding.py diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference/offline_inference_vision_language_multi_image.py similarity index 100% rename from examples/offline_inference_vision_language_multi_image.py rename to examples/offline_inference/offline_inference_vision_language_multi_image.py diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference/offline_inference_whisper.py similarity index 100% rename from examples/offline_inference_whisper.py rename to examples/offline_inference/offline_inference_whisper.py diff --git a/examples/offline_inference_with_default_generation_config.py b/examples/offline_inference/offline_inference_with_default_generation_config.py similarity index 100% rename from examples/offline_inference_with_default_generation_config.py rename to examples/offline_inference/offline_inference_with_default_generation_config.py diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference/offline_inference_with_prefix.py similarity index 100% rename from examples/offline_inference_with_prefix.py rename to examples/offline_inference/offline_inference_with_prefix.py diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference/offline_inference_with_profiler.py similarity index 100% rename from examples/offline_inference_with_profiler.py rename to 
examples/offline_inference/offline_inference_with_profiler.py diff --git a/examples/offline_profile.py b/examples/offline_inference/offline_profile.py similarity index 99% rename from examples/offline_profile.py rename to examples/offline_inference/offline_profile.py index 46afe8aa2604b..187a05e4d70a2 100644 --- a/examples/offline_profile.py +++ b/examples/offline_inference/offline_profile.py @@ -363,7 +363,7 @@ def abort_requests(): example: ``` - python examples/offline_profile.py \\ + python examples/offline_inference/offline_profile.py \\ --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ --enforce-eager run_num_steps -n 2 diff --git a/examples/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py similarity index 100% rename from examples/save_sharded_state.py rename to examples/offline_inference/save_sharded_state.py diff --git a/examples/api_client.py b/examples/online_serving/api_client.py similarity index 100% rename from examples/api_client.py rename to examples/online_serving/api_client.py diff --git a/examples/chart-helm/.helmignore b/examples/online_serving/chart-helm/.helmignore similarity index 100% rename from examples/chart-helm/.helmignore rename to examples/online_serving/chart-helm/.helmignore diff --git a/examples/chart-helm/Chart.yaml b/examples/online_serving/chart-helm/Chart.yaml similarity index 100% rename from examples/chart-helm/Chart.yaml rename to examples/online_serving/chart-helm/Chart.yaml diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md new file mode 100644 index 0000000000000..6aa126d4fd22c --- /dev/null +++ b/examples/online_serving/chart-helm/README.md @@ -0,0 +1,21 @@ +# Helm Charts + +This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more. + +## Files + +- Chart.yaml: Defines the chart metadata including name, version, and maintainers. +- ct.yaml: Configuration for chart testing. +- lintconf.yaml: Linting rules for YAML files. +- values.schema.json: JSON schema for validating values.yaml. +- values.yaml: Default values for the Helm chart. +- templates/_helpers.tpl: Helper templates for defining common configurations. +- templates/configmap.yaml: Template for creating ConfigMaps. +- templates/custom-objects.yaml: Template for custom Kubernetes objects. +- templates/deployment.yaml: Template for creating Deployments. +- templates/hpa.yaml: Template for Horizontal Pod Autoscaler. +- templates/job.yaml: Template for Kubernetes Jobs. +- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget. +- templates/pvc.yaml: Template for Persistent Volume Claims. +- templates/secrets.yaml: Template for Kubernetes Secrets. +- templates/service.yaml: Template for creating Services. 
\ No newline at end of file diff --git a/examples/chart-helm/ct.yaml b/examples/online_serving/chart-helm/ct.yaml similarity index 100% rename from examples/chart-helm/ct.yaml rename to examples/online_serving/chart-helm/ct.yaml diff --git a/examples/chart-helm/lintconf.yaml b/examples/online_serving/chart-helm/lintconf.yaml similarity index 100% rename from examples/chart-helm/lintconf.yaml rename to examples/online_serving/chart-helm/lintconf.yaml diff --git a/examples/chart-helm/templates/_helpers.tpl b/examples/online_serving/chart-helm/templates/_helpers.tpl similarity index 100% rename from examples/chart-helm/templates/_helpers.tpl rename to examples/online_serving/chart-helm/templates/_helpers.tpl diff --git a/examples/chart-helm/templates/configmap.yaml b/examples/online_serving/chart-helm/templates/configmap.yaml similarity index 100% rename from examples/chart-helm/templates/configmap.yaml rename to examples/online_serving/chart-helm/templates/configmap.yaml diff --git a/examples/chart-helm/templates/custom-objects.yaml b/examples/online_serving/chart-helm/templates/custom-objects.yaml similarity index 100% rename from examples/chart-helm/templates/custom-objects.yaml rename to examples/online_serving/chart-helm/templates/custom-objects.yaml diff --git a/examples/chart-helm/templates/deployment.yaml b/examples/online_serving/chart-helm/templates/deployment.yaml similarity index 100% rename from examples/chart-helm/templates/deployment.yaml rename to examples/online_serving/chart-helm/templates/deployment.yaml diff --git a/examples/chart-helm/templates/hpa.yaml b/examples/online_serving/chart-helm/templates/hpa.yaml similarity index 100% rename from examples/chart-helm/templates/hpa.yaml rename to examples/online_serving/chart-helm/templates/hpa.yaml diff --git a/examples/chart-helm/templates/job.yaml b/examples/online_serving/chart-helm/templates/job.yaml similarity index 100% rename from examples/chart-helm/templates/job.yaml rename to examples/online_serving/chart-helm/templates/job.yaml diff --git a/examples/chart-helm/templates/poddisruptionbudget.yaml b/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml similarity index 100% rename from examples/chart-helm/templates/poddisruptionbudget.yaml rename to examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml diff --git a/examples/chart-helm/templates/pvc.yaml b/examples/online_serving/chart-helm/templates/pvc.yaml similarity index 100% rename from examples/chart-helm/templates/pvc.yaml rename to examples/online_serving/chart-helm/templates/pvc.yaml diff --git a/examples/chart-helm/templates/secrets.yaml b/examples/online_serving/chart-helm/templates/secrets.yaml similarity index 100% rename from examples/chart-helm/templates/secrets.yaml rename to examples/online_serving/chart-helm/templates/secrets.yaml diff --git a/examples/chart-helm/templates/service.yaml b/examples/online_serving/chart-helm/templates/service.yaml similarity index 100% rename from examples/chart-helm/templates/service.yaml rename to examples/online_serving/chart-helm/templates/service.yaml diff --git a/examples/chart-helm/values.schema.json b/examples/online_serving/chart-helm/values.schema.json similarity index 100% rename from examples/chart-helm/values.schema.json rename to examples/online_serving/chart-helm/values.schema.json diff --git a/examples/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml similarity index 100% rename from examples/chart-helm/values.yaml rename to 
examples/online_serving/chart-helm/values.yaml diff --git a/examples/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh similarity index 100% rename from examples/disaggregated_prefill.sh rename to examples/online_serving/disaggregated_prefill.sh diff --git a/examples/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py similarity index 100% rename from examples/gradio_openai_chatbot_webserver.py rename to examples/online_serving/gradio_openai_chatbot_webserver.py diff --git a/examples/gradio_webserver.py b/examples/online_serving/gradio_webserver.py similarity index 100% rename from examples/gradio_webserver.py rename to examples/online_serving/gradio_webserver.py diff --git a/examples/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py similarity index 100% rename from examples/openai_chat_completion_client.py rename to examples/online_serving/openai_chat_completion_client.py diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py similarity index 100% rename from examples/openai_chat_completion_client_for_multimodal.py rename to examples/online_serving/openai_chat_completion_client_for_multimodal.py diff --git a/examples/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py similarity index 100% rename from examples/openai_chat_completion_client_with_tools.py rename to examples/online_serving/openai_chat_completion_client_with_tools.py diff --git a/examples/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py similarity index 100% rename from examples/openai_chat_completion_structured_outputs.py rename to examples/online_serving/openai_chat_completion_structured_outputs.py diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py similarity index 100% rename from examples/openai_chat_embedding_client_for_multimodal.py rename to examples/online_serving/openai_chat_embedding_client_for_multimodal.py diff --git a/examples/openai_completion_client.py b/examples/online_serving/openai_completion_client.py similarity index 100% rename from examples/openai_completion_client.py rename to examples/online_serving/openai_completion_client.py diff --git a/examples/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py similarity index 100% rename from examples/openai_cross_encoder_score.py rename to examples/online_serving/openai_cross_encoder_score.py diff --git a/examples/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py similarity index 100% rename from examples/openai_embedding_client.py rename to examples/online_serving/openai_embedding_client.py diff --git a/examples/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py similarity index 100% rename from examples/openai_pooling_client.py rename to examples/online_serving/openai_pooling_client.py diff --git a/examples/opentelemetry/Otel.md b/examples/online_serving/opentelemetry/Otel.md similarity index 100% rename from examples/opentelemetry/Otel.md rename to examples/online_serving/opentelemetry/Otel.md diff --git a/examples/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py similarity index 100% rename from 
examples/opentelemetry/dummy_client.py rename to examples/online_serving/opentelemetry/dummy_client.py diff --git a/examples/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md similarity index 100% rename from examples/prometheus_grafana/README.md rename to examples/online_serving/prometheus_grafana/README.md diff --git a/examples/prometheus_grafana/docker-compose.yaml b/examples/online_serving/prometheus_grafana/docker-compose.yaml similarity index 100% rename from examples/prometheus_grafana/docker-compose.yaml rename to examples/online_serving/prometheus_grafana/docker-compose.yaml diff --git a/examples/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json similarity index 100% rename from examples/prometheus_grafana/grafana.json rename to examples/online_serving/prometheus_grafana/grafana.json diff --git a/examples/prometheus_grafana/prometheus.yaml b/examples/online_serving/prometheus_grafana/prometheus.yaml similarity index 100% rename from examples/prometheus_grafana/prometheus.yaml rename to examples/online_serving/prometheus_grafana/prometheus.yaml diff --git a/examples/run_cluster.sh b/examples/online_serving/run_cluster.sh similarity index 100% rename from examples/run_cluster.sh rename to examples/online_serving/run_cluster.sh diff --git a/examples/sagemaker-entrypoint.sh b/examples/online_serving/sagemaker-entrypoint.sh similarity index 100% rename from examples/sagemaker-entrypoint.sh rename to examples/online_serving/sagemaker-entrypoint.sh diff --git a/examples/fp8/README.md b/examples/other/fp8/README.md similarity index 88% rename from examples/fp8/README.md rename to examples/other/fp8/README.md index 5492872cae93a..4e8031d954113 100644 --- a/examples/fp8/README.md +++ b/examples/other/fp8/README.md @@ -20,12 +20,12 @@ Before incorporating the FP8 datatype for inference workloads, you must adhere t ### 2. Convert HF model into a quantized HF model. Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). -`quantize.py` (examples/fp8/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). +`quantize.py` (examples/other/fp8/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). -The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/fp8/quantizer/README.md`. +The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/other/fp8/quantizer/README.md`. ### 3. Extract KV Cache Scaling Factors from quantized HF model. -`extract_scales.py` (examples/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: +`extract_scales.py` (examples/other/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: 1. 
**File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. 2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. @@ -35,7 +35,7 @@ The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found a ```python # prerequisites: # - Quantized HF LLaMa 2 model -python3 examples/fp8/extract_scales.py --help +python3 examples/other/fp8/extract_scales.py --help Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] KV Scale Extraction Example @@ -52,7 +52,7 @@ Optional arguments: ``` ```python Example: -python3 examples/fp8/extract_scales.py --quantized_model --tp_size --output_dir +python3 examples/other/fp8/extract_scales.py --quantized_model --tp_size --output_dir ``` ### 4. Load KV Cache Scaling Factors into VLLM. This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8. diff --git a/examples/fp8/extract_scales.py b/examples/other/fp8/extract_scales.py similarity index 100% rename from examples/fp8/extract_scales.py rename to examples/other/fp8/extract_scales.py diff --git a/examples/fp8/quantizer/README.md b/examples/other/fp8/quantizer/README.md similarity index 100% rename from examples/fp8/quantizer/README.md rename to examples/other/fp8/quantizer/README.md diff --git a/examples/fp8/quantizer/quantize.py b/examples/other/fp8/quantizer/quantize.py similarity index 100% rename from examples/fp8/quantizer/quantize.py rename to examples/other/fp8/quantizer/quantize.py diff --git a/examples/logging_configuration.md b/examples/other/logging_configuration.md similarity index 100% rename from examples/logging_configuration.md rename to examples/other/logging_configuration.md diff --git a/examples/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py similarity index 96% rename from examples/tensorize_vllm_model.py rename to examples/other/tensorize_vllm_model.py index dd77a4ad0c6b7..5fff1fdf502c9 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/other/tensorize_vllm_model.py @@ -25,7 +25,7 @@ To serialize a model, install vLLM from source, then run something like this from the root level of this repository: -python -m examples.tensorize_vllm_model \ +python -m examples.offline_inference.tensorize_vllm_model \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -45,7 +45,7 @@ To deserialize a model, you can run something like this from the root level of this repository: -python -m examples.tensorize_vllm_model \ +python -m examples.offline_inference.tensorize_vllm_model \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -63,11 +63,11 @@ model-rank-%03d.tensors For more information on the available arguments for serializing, run -`python -m examples.tensorize_vllm_model serialize --help`. 
+`python -m examples.offline_inference.tensorize_vllm_model serialize --help`. Or for deserializing: -`python -m examples.tensorize_vllm_model deserialize --help`. +`python -m examples.offline_inference.tensorize_vllm_model deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -88,7 +88,7 @@ In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python -m examples.tensorize_vllm_model deserialize --help` +`python -m examples.offline_inference.tensorize_vllm_model deserialize --help` under the `tensorizer options` section. These can also be used for deserialization in this example script, although `--tensorizer-uri` and diff --git a/pyproject.toml b/pyproject.toml index 45fa4bff4e680..0ac3f39ef7a5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ build-backend = "setuptools.build_meta" line-length = 80 exclude = [ # External file, leaving license intact - "examples/fp8/quantizer/quantize.py" + "examples/other/fp8/quantizer/quantize.py" ] [tool.ruff.lint.per-file-ignores] diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 0d27cf9f152e0..57518bd3e8299 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -5,7 +5,7 @@ def test_platform_plugins(): import os example_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(current_file))), - "examples", "offline_inference.py") + "examples", "offline_inference/offline_inference.py") runpy.run_path(example_file) # check if the plugin is loaded correctly diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 0b0792b6b845f..bf409d2d97aa1 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -163,8 +163,8 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): multilora_inference = import_from_path( - "examples.multilora_inference", - EXAMPLES_PATH / "multilora_inference.py", + "examples.offline_inference.multilora_inference", + EXAMPLES_PATH / "offline_inference/multilora_inference.py", ) model_ref = "meta-llama/Llama-2-7b-hf" diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 394ca8663e189..49366abc7fb56 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0): type=str, required=True, help="json trace file output by " - "examples/offline_profile.py") + "examples/offline_inference/offline_profile.py") parser.add_argument("--phase", type=str, required=True, diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index da7a28da15c19..fa88ed4204d8f 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -534,11 +534,11 @@ def make_plot_title_suffix(profile_json: dict) -> str: if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--json-trace", - type=str, - required=True, - help="json trace file output by examples/offline_profile.py") + parser.add_argument("--json-trace", + type=str, + required=True, + help="json trace file output by \ + examples/offline_inference/offline_profile.py") 
parser.add_argument("--output-directory", type=str, required=False, diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md index dab2d10c4c9d0..e20c992a381a3 100644 --- a/vllm/distributed/kv_transfer/README.md +++ b/vllm/distributed/kv_transfer/README.md @@ -22,7 +22,7 @@ NOTE: If you want to not only transfer KV caches, but adjust the model execution ## Disaggregated prefilling -The example usage is in [this file](../../../examples/disaggregated_prefill.sh). +The example usage is in [this file](../../../examples/online_serving/disaggregated_prefill.sh). Here is the diagram of how we run disaggretgated prefilling. diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index a9c1fa7221217..0033fbff0e9ac 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -452,9 +452,9 @@ def _load_model_serialized_cpu( """Load a serialized model with tensorizer to the CPU. This is only necessary when the model isn't vLLM-tensorized (see - examples/tensorize_vllm_model.py) This should still be faster than - default HuggingFace loading, but will be slower than loading a - vLLM-tensorized model. + examples/other/tensorize_vllm_model.py) This should still + be faster than default HuggingFace loading, but will be slower than + loading a vLLM-tensorized model. """ device_config = vllm_config.device_config model_config = vllm_config.model_config @@ -472,7 +472,7 @@ def _load_model_serialized( """Load a serialized model with tensorizer. Expects a vLLM-tensorized model. See the - examples/tensorize_vllm_model.py example script + examples/other/tensorize_vllm_model.py example script for serializing vLLM models.""" device_config = vllm_config.device_config @@ -529,7 +529,8 @@ class ShardedStateLoader(BaseModelLoader): Model loader that directly loads each worker's model state dict, which enables a fast load path for large tensor-parallel models where each worker only needs to read its own shard rather than the entire checkpoint. See - `examples/save_sharded_state.py` for creating a sharded checkpoint. + `examples/offline_inference/save_sharded_state.py` for creating a sharded + checkpoint. """ DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors" diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 8b929f299c8d8..fbd4937112e11 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -155,7 +155,7 @@ class TensorizerArgs: encryption_keyfile: File path to a binary file containing a binary key to use for decryption. `None` (the default) means no decryption. See the example script in - examples/tensorize_vllm_model.py. + examples/other/tensorize_vllm_model.py. s3_access_key_id: The access key for the S3 bucket. Can also be set via the S3_ACCESS_KEY_ID environment variable. s3_secret_access_key: The secret access key for the S3 bucket. Can also @@ -363,12 +363,12 @@ def deserialize(self): def tensorizer_weights_iterator( tensorizer_args: "TensorizerArgs" ) -> Generator[Tuple[str, torch.Tensor], None, None]: - logger.warning( - "Deserializing HuggingFace models is not optimized for " - "loading on vLLM, as tensorizer is forced to load to CPU. " - "Consider deserializing a vLLM model instead for faster " - "load times. 
See the examples/tensorize_vllm_model.py example " - "script for serializing vLLM models.") + logger.warning("Deserializing HuggingFace models is not optimized for " + "loading on vLLM, as tensorizer is forced to load to CPU. " + "Consider deserializing a vLLM model instead for faster " + "load times. See the " + "examples/other/tensorize_vllm_model.py example script " + "for serializing vLLM models.") deserializer_args = tensorizer_args.deserializer_params stream_params = tensorizer_args.stream_params diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8aa0c98df70d2..a2c991cfdb74e 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -503,7 +503,8 @@ def kv_cache_scales_loader( KV cache scaling factors. The serialization should represent a dictionary whose keys are the TP ranks and values are another dictionary mapping layers to their KV cache scaling factors. - Keep this function in sync with the output of examples/fp8/extract_scales.py + Keep this function in sync with the output of + examples/other/fp8/extract_scales.py """ try: with open(filename) as f: From 6cd40a5bfed24ef0ceca83b0450be6920d8ca6d4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 8 Jan 2025 21:34:44 +0800 Subject: [PATCH 17/55] [Doc][4/N] Reorganize API Reference (#11843) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 2 +- Dockerfile | 4 ++-- .../{dev => api}/engine/async_llm_engine.md | 0 .../engine_index.md => api/engine/index.md} | 0 docs/source/{dev => api}/engine/llm_engine.md | 0 .../multimodal/index.md} | 10 -------- .../offline_inference/index.md} | 0 .../{dev => api}/offline_inference/llm.md | 0 .../offline_inference/llm_inputs.md | 0 docs/source/api/params.md | 22 ++++++++++++++++++ .../dockerfile-stages-dependency.png | Bin .../contributing/dockerfile/dockerfile.md | 2 +- docs/source/design/arch_overview.md | 2 +- .../multimodal/adding_multimodal_plugin.md | 16 ------------- docs/source/dev/pooling_params.md | 6 ----- docs/source/dev/sampling_params.md | 6 ----- docs/source/getting_started/quickstart.md | 2 +- docs/source/index.md | 9 ++++--- docs/source/serving/offline_inference.md | 2 +- .../serving/openai_compatible_server.md | 8 +++---- vllm/multimodal/base.py | 3 --- vllm/multimodal/inputs.py | 6 ----- vllm/multimodal/registry.py | 3 --- vllm/pooling_params.py | 2 +- 24 files changed, 38 insertions(+), 67 deletions(-) rename docs/source/{dev => api}/engine/async_llm_engine.md (100%) rename docs/source/{dev/engine/engine_index.md => api/engine/index.md} (100%) rename docs/source/{dev => api}/engine/llm_engine.md (100%) rename docs/source/{design/multimodal/multimodal_index.md => api/multimodal/index.md} (84%) rename docs/source/{dev/offline_inference/offline_index.md => api/offline_inference/index.md} (100%) rename docs/source/{dev => api}/offline_inference/llm.md (100%) rename docs/source/{dev => api}/offline_inference/llm_inputs.md (100%) create mode 100644 docs/source/api/params.md rename docs/source/assets/{dev => contributing}/dockerfile-stages-dependency.png (100%) delete mode 100644 docs/source/design/multimodal/adding_multimodal_plugin.md delete mode 100644 docs/source/dev/pooling_params.md delete mode 100644 docs/source/dev/sampling_params.md diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b7178b94f481a..f883595f6d9ad 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -38,7 
+38,7 @@ steps: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/dev/sampling_params.html + - grep \"sig sig-object py\" build/html/api/params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min fast_check: true diff --git a/Dockerfile b/Dockerfile index 808cf675acf4d..4542bc9cf0bd2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,8 @@ # to run the OpenAI compatible server. # Please update any changes made here to -# docs/source/dev/dockerfile/dockerfile.md and -# docs/source/assets/dev/dockerfile-stages-dependency.png +# docs/source/contributing/dockerfile/dockerfile.md and +# docs/source/assets/contributing/dockerfile-stages-dependency.png ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### diff --git a/docs/source/dev/engine/async_llm_engine.md b/docs/source/api/engine/async_llm_engine.md similarity index 100% rename from docs/source/dev/engine/async_llm_engine.md rename to docs/source/api/engine/async_llm_engine.md diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/api/engine/index.md similarity index 100% rename from docs/source/dev/engine/engine_index.md rename to docs/source/api/engine/index.md diff --git a/docs/source/dev/engine/llm_engine.md b/docs/source/api/engine/llm_engine.md similarity index 100% rename from docs/source/dev/engine/llm_engine.md rename to docs/source/api/engine/llm_engine.md diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/api/multimodal/index.md similarity index 84% rename from docs/source/design/multimodal/multimodal_index.md rename to docs/source/api/multimodal/index.md index e4f2171e84ff7..0046b73ea825e 100644 --- a/docs/source/design/multimodal/multimodal_index.md +++ b/docs/source/api/multimodal/index.md @@ -11,18 +11,8 @@ vLLM provides experimental support for multi-modal models through the {mod}`vllm Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. -Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities -by following [this guide](#adding-multimodal-plugin). - Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). -## Guides - -```{toctree} -:maxdepth: 1 - -adding_multimodal_plugin -``` ## Module Contents diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/api/offline_inference/index.md similarity index 100% rename from docs/source/dev/offline_inference/offline_index.md rename to docs/source/api/offline_inference/index.md diff --git a/docs/source/dev/offline_inference/llm.md b/docs/source/api/offline_inference/llm.md similarity index 100% rename from docs/source/dev/offline_inference/llm.md rename to docs/source/api/offline_inference/llm.md diff --git a/docs/source/dev/offline_inference/llm_inputs.md b/docs/source/api/offline_inference/llm_inputs.md similarity index 100% rename from docs/source/dev/offline_inference/llm_inputs.md rename to docs/source/api/offline_inference/llm_inputs.md diff --git a/docs/source/api/params.md b/docs/source/api/params.md new file mode 100644 index 0000000000000..a3b4d9cbb44ec --- /dev/null +++ b/docs/source/api/params.md @@ -0,0 +1,22 @@ +# Optional Parameters + +Optional parameters for vLLM APIs. 
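As a quick illustration, `SamplingParams` is passed per request when generating text; a minimal sketch (the model name is only an example) looks like this:

```python
from vllm import LLM, SamplingParams

# Sampling parameters are passed alongside the prompts to LLM.generate().
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```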
+ +(sampling-params)= + +## Sampling Parameters + +```{eval-rst} +.. autoclass:: vllm.SamplingParams + :members: +``` + +(pooling-params)= + +## Pooling Parameters + +```{eval-rst} +.. autoclass:: vllm.PoolingParams + :members: +``` + diff --git a/docs/source/assets/dev/dockerfile-stages-dependency.png b/docs/source/assets/contributing/dockerfile-stages-dependency.png similarity index 100% rename from docs/source/assets/dev/dockerfile-stages-dependency.png rename to docs/source/assets/contributing/dockerfile-stages-dependency.png diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index 38ea956ba8dfb..cb142318b8724 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -17,7 +17,7 @@ The edges of the build graph represent: - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png + > ```{figure} /assets/contributing/dockerfile-stages-dependency.png > :align: center > :alt: query > :width: 100% diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 5e0dd021ad02e..cec503ef2f77d 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -53,7 +53,7 @@ for output in outputs: ``` More API details can be found in the {doc}`Offline Inference -` section of the API docs. +` section of the API docs. The code for the `LLM` class can be found in . diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md deleted file mode 100644 index bcccd284879bb..0000000000000 --- a/docs/source/design/multimodal/adding_multimodal_plugin.md +++ /dev/null @@ -1,16 +0,0 @@ -(adding-multimodal-plugin)= - -# Adding a Multimodal Plugin - -This document teaches you how to add a new modality to vLLM. - -Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. -For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`. - -The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s. - -```{note} -This article is a work in progress. -``` - -% TODO: Add more instructions on how to add new plugins once embeddings is in. diff --git a/docs/source/dev/pooling_params.md b/docs/source/dev/pooling_params.md deleted file mode 100644 index 74b2c57443e4b..0000000000000 --- a/docs/source/dev/pooling_params.md +++ /dev/null @@ -1,6 +0,0 @@ -# Pooling Parameters - -```{eval-rst} -.. autoclass:: vllm.PoolingParams - :members: -``` diff --git a/docs/source/dev/sampling_params.md b/docs/source/dev/sampling_params.md deleted file mode 100644 index bdc36af5153db..0000000000000 --- a/docs/source/dev/sampling_params.md +++ /dev/null @@ -1,6 +0,0 @@ -# Sampling Parameters - -```{eval-rst} -.. 
autoclass:: vllm.SamplingParams - :members: -``` diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 6b56918ce5638..2808e1b386801 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -42,7 +42,7 @@ The first line of this example imports the classes {class}`~vllm.LLM` and {class from vllm import LLM, SamplingParams ``` -The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html). +The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params). ```python prompts = [ diff --git a/docs/source/index.md b/docs/source/index.md index 11d3e24a9b60a..6747a7fcce4fe 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -137,10 +137,10 @@ community/sponsors :caption: API Reference :maxdepth: 2 -dev/sampling_params -dev/pooling_params -dev/offline_inference/offline_index -dev/engine/engine_index +api/offline_inference/index +api/engine/index +api/multimodal/index +api/params ``` % Design Documents: Details about vLLM internals @@ -154,7 +154,6 @@ design/huggingface_integration design/plugin_system design/kernel/paged_attention design/input_processing/model_inputs_index -design/multimodal/multimodal_index design/automatic_prefix_caching design/multiprocessing ``` diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 83178f7811825..79092ab208784 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -23,7 +23,7 @@ The available APIs depend on the type of model that is being run: Please refer to the above pages for more details about each API. ```{seealso} -[API Reference](/dev/offline_inference/offline_index) +[API Reference](/api/offline_inference/index) ``` ## Configuration Options diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 022dd3ae8a237..ec5a367594743 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -195,7 +195,7 @@ Code example: #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters](#sampling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -226,7 +226,7 @@ Code example: #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters](#sampling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -259,7 +259,7 @@ Code example: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. 
+The following [pooling parameters](#pooling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -447,7 +447,7 @@ Response: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. +The following [pooling parameters](#pooling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7f4029e726332..4941fbac963ca 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -49,9 +49,6 @@ class MultiModalPlugin(ABC): process the same data differently). This registry is in turn used by :class:`~MultiModalRegistry` which acts at a higher level (i.e., the modality of the data). - - See also: - :ref:`adding-multimodal-plugin` """ def __init__(self) -> None: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 8fdcc4b524035..d542461874866 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -99,12 +99,6 @@ class MultiModalDataBuiltins(TypedDict, total=False): MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] """ A dictionary containing an entry for each modality type to input. - -Note: - This dictionary also accepts modality keys defined outside - :class:`MultiModalDataBuiltins` as long as a customized plugin - is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. """ diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 5f01eac4edade..9eceefb08c93f 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -125,9 +125,6 @@ def __init__( def register_plugin(self, plugin: MultiModalPlugin) -> None: """ Register a multi-modal plugin so it can be recognized by vLLM. - - See also: - :ref:`adding-multimodal-plugin` """ data_type_key = plugin.get_data_key() diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 2635c0bccd1c4..b24b7e91a7ae7 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -7,7 +7,7 @@ class PoolingParams( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] - """Pooling parameters for embeddings API. + """API parameters for pooling models. This is currently a placeholder. Attributes: additional_data: Any additional data needed for pooling. From 2f7024987e582b85b280909b87287668cd97c92f Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 8 Jan 2025 23:18:28 +0800 Subject: [PATCH 18/55] [CI/Build][Bugfix] Fix CPU CI image clean up (#11836) Signed-off-by: jiang1.li --- .buildkite/run-cpu-test.sh | 7 ++----- vllm/model_executor/layers/activation.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 87d08c8c7fdcb..1a4dae8f65e99 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -13,7 +13,7 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BU numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu . 
# Setup cleanup -remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; } +remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; } trap remove_docker_container EXIT remove_docker_container @@ -35,10 +35,7 @@ function cpu_tests() { # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install pytest pytest-asyncio \ - decord einops librosa peft Pillow sentence-transformers soundfile \ - transformers_stream_generator matplotlib datamodel_code_generator - pip install torchvision --index-url https://download.pytorch.org/whl/cpu + pip install -r vllm/requirements-test.txt pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/encoder_decoder/language -m cpu_model diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 46d4670bfe4f9..b8a302cf5087f 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -61,7 +61,7 @@ class SiluAndMul(CustomOp): def __init__(self): super().__init__() - if current_platform.is_cuda_alike(): + if current_platform.is_cuda_alike() or current_platform.is_cpu(): self.op = torch.ops._C.silu_and_mul elif current_platform.is_xpu(): import intel_extension_for_pytorch as ipex From 78f4590b60161dee1a444870ae682ba45f633502 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Thu, 9 Jan 2025 00:11:50 +0800 Subject: [PATCH 19/55] [Bugfix][XPU] fix silu_and_mul (#11823) Signed-off-by: yan ma --- vllm/model_executor/layers/activation.py | 4 ++-- vllm/plugins/__init__.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index b8a302cf5087f..32456fee06a28 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -64,8 +64,8 @@ def __init__(self): if current_platform.is_cuda_alike() or current_platform.is_cpu(): self.op = torch.ops._C.silu_and_mul elif current_platform.is_xpu(): - import intel_extension_for_pytorch as ipex - self.op = ipex.llm.functional.silu_and_mul + from vllm._ipex_ops import ipex_ops + self.op = ipex_ops.silu_and_mul def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index c50eb2cef4cd5..e5fa4f0e4a2f6 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -63,8 +63,8 @@ def load_general_plugins(): from vllm.platforms import current_platform if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158 # noqa - os.environ['TORCH_COMPILE_DISABLE'] = 'True' + # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 + torch._dynamo.config.disable = True if current_platform.is_hpu(): # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) # does not support torch.compile @@ -72,7 +72,6 @@ def load_general_plugins(): # torch.compile support is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' if is_lazy: - # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 torch._dynamo.config.disable = True # 
NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) # requires enabling lazy collectives From ca47e176af9e0a4fa9f02325cdad5f11b40aedab Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 9 Jan 2025 01:04:46 +0800 Subject: [PATCH 20/55] [Misc] Move some model utils into vision file (#11848) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/clip.py | 5 +- vllm/model_executor/models/pixtral.py | 5 +- vllm/model_executor/models/qwen2_vl.py | 3 +- vllm/model_executor/models/siglip.py | 5 +- vllm/model_executor/models/utils.py | 37 +----------- vllm/model_executor/models/vision.py | 83 +++++++++++++++++++++++++- vllm/multimodal/inputs.py | 4 +- vllm/multimodal/utils.py | 44 -------------- 8 files changed, 94 insertions(+), 92 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 1bde45cb140cb..dd69f6c9a5aff 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -20,11 +20,10 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens, - resolve_visual_encoder_outputs) + repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -from .vision import VisionEncoderInfo +from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index b74bb3c8a3f88..37b9989e489ec 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -31,14 +31,13 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - resolve_visual_encoder_outputs) + consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData from .interfaces import SupportsMultiModal, SupportsPP from .utils import (init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import VisionEncoderInfo +from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs try: from xformers import ops as xops diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 8537fec854b6d..76a810e8f0c20 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -66,8 +66,9 @@ from vllm.transformers_utils.config import uses_mrope from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, +from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix) +from .vision import get_vit_attn_backend logger = init_logger(__name__) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 7ea177e94afc0..cca42842bc06e 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -24,11 +24,10 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens, - resolve_visual_encoder_outputs) + repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -from 
.vision import VisionEncoderInfo +from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 4ed3b237ae0e2..43b3c973c97b8 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -8,16 +8,12 @@ from torch.func import functional_call from transformers import PretrainedConfig -import vllm.envs as envs -from vllm.attention.selector import (backend_name_to_enum, - get_global_forced_attn_backend) from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors -from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import is_pin_memory_available, print_warning_once +from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -612,37 +608,6 @@ def make_empty_intermediate_tensors( return make_empty_intermediate_tensors -def get_vit_attn_backend(support_fa: bool = False) -> _Backend: - """ - Get the available attention backend for Vision Transformer. - """ - # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn. - selected_backend: Optional[_Backend] = get_global_forced_attn_backend() - if selected_backend is None: - backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND - if backend_by_env_var is not None: - selected_backend = backend_name_to_enum(backend_by_env_var) - if selected_backend is None: - # For Volta and Turing GPUs, use xformers instead. - device_available = current_platform.has_device_capability(80) - if device_available and support_fa: - from transformers.utils import is_flash_attn_2_available - if is_flash_attn_2_available(): - selected_backend = _Backend.FLASH_ATTN - else: - print_warning_once( - "Current `vllm-flash-attn` has a bug inside vision module, " - "so we use xformers backend instead. You can run " - "`pip install flash-attn` to use flash-attention backend.") - selected_backend = _Backend.XFORMERS - elif current_platform.is_cpu() or current_platform.is_rocm(): - # ROCM doesn't support xformers - selected_backend = _Backend.TORCH_SDPA - else: - selected_backend = _Backend.XFORMERS - return selected_backend - - def maybe_prefix(prefix: str, name: str) -> str: """Add a prefix to a name if the prefix is non-empty. diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 8516c9f7066f7..e6a9e153d9107 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,8 +1,15 @@ from abc import ABC, abstractmethod -from typing import Final, Generic, Protocol, TypeVar +from typing import Final, Generic, Optional, Protocol, TypeVar, Union +import torch from transformers import PretrainedConfig +import vllm.envs as envs +from vllm.attention.selector import (backend_name_to_enum, + get_global_forced_attn_backend) +from vllm.platforms import _Backend, current_platform +from vllm.utils import print_warning_once + _C = TypeVar("_C", bound=PretrainedConfig) @@ -60,3 +67,77 @@ def get_vision_encoder_info( msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) + + +def get_vit_attn_backend(support_fa: bool = False) -> _Backend: + """ + Get the available attention backend for Vision Transformer. 
+    """
+    # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn.
+    selected_backend: Optional[_Backend] = get_global_forced_attn_backend()
+    if selected_backend is None:
+        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+        if backend_by_env_var is not None:
+            selected_backend = backend_name_to_enum(backend_by_env_var)
+    if selected_backend is None:
+        # For Volta and Turing GPUs, use xformers instead.
+        device_available = current_platform.has_device_capability(80)
+        if device_available and support_fa:
+            from transformers.utils import is_flash_attn_2_available
+            if is_flash_attn_2_available():
+                selected_backend = _Backend.FLASH_ATTN
+            else:
+                print_warning_once(
+                    "Current `vllm-flash-attn` has a bug inside vision module, "
+                    "so we use xformers backend instead. You can run "
+                    "`pip install flash-attn` to use flash-attention backend.")
+                selected_backend = _Backend.XFORMERS
+        elif current_platform.is_cpu() or current_platform.is_rocm():
+            # ROCM doesn't support xformers
+            selected_backend = _Backend.TORCH_SDPA
+        else:
+            selected_backend = _Backend.XFORMERS
+    return selected_backend
+
+
+def resolve_visual_encoder_outputs(
+    encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
+    feature_sample_layers: Optional[list[int]],
+    post_layer_norm: Optional[torch.nn.LayerNorm],
+    max_possible_layers: int,
+) -> torch.Tensor:
+    """Given the outputs of a visual encoder module that may correspond to the
+    output of the last layer, or a list of hidden states to be stacked,
+    handle post normalization and resolve it into a single output tensor.
+
+    Args:
+        encoder_outputs: Output of encoder's last layer or all hidden states.
+        feature_sample_layers: Optional layer indices to grab from the encoder
+            outputs; if provided, encoder outputs must be a list.
+        post_layer_norm: Post norm to apply to the output of the encoder.
+        max_possible_layers: Total layers in the fully loaded visual encoder.
+
+    """
+    if feature_sample_layers is None:
+        if post_layer_norm is not None:
+            return post_layer_norm(encoder_outputs)
+        return encoder_outputs
+
+    # Get the hidden states corresponding to the layer indices.
+    # Negative values are relative to the full visual encoder,
+    # so offset them depending on how many layers were loaded.
+    # NOTE: this assumes that encoder_outputs contains a list
+    # of hidden states in the same order as the encoder layers
+    # that produced them.
+    offset = max_possible_layers - len(encoder_outputs)
+    hs_pool = [
+        encoder_outputs[layer_idx]
+        if layer_idx >= 0 else encoder_outputs[layer_idx + offset]
+        for layer_idx in feature_sample_layers
+    ]
+
+    # Apply post-norm on the final hidden state if we are using it
+    uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1)
+    if post_layer_norm is not None and uses_last_layer:
+        hs_pool[-1] = post_layer_norm(encoder_outputs)
+    return torch.cat(hs_pool, dim=-1)
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index d542461874866..8680e4175593b 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -99,6 +99,8 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.
+
+The built-in modalities are defined by :class:`MultiModalDataBuiltins`.
""" @@ -485,7 +487,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] """ -A dictionary containing placeholder ranges. +A dictionary containing placeholder ranges for each modality. """ diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index f4a514ba55d0c..1c6bbf77b926f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -5,7 +5,6 @@ import numpy as np import numpy.typing as npt -import torch from PIL import Image import vllm.envs as envs @@ -285,49 +284,6 @@ def encode_video_base64(frames: npt.NDArray) -> str: return video_io.encode_base64(frames) -def resolve_visual_encoder_outputs( - encoder_outputs: Union[torch.Tensor, list[torch.Tensor]], - feature_sample_layers: Optional[list[int]], - post_layer_norm: Optional[torch.nn.LayerNorm], - max_possible_layers: int, -) -> torch.Tensor: - """Given the outputs a visual encoder module that may correspond to the - output of the last layer, or a list of hidden states to be stacked, - handle post normalization and resolve it into a single output tensor. - - Args: - encoder_outputs: Output of encoder's last layer or all hidden states. - feature_sample_layers: Optional layer indices to grab from the encoder - outputs; if provided, encoder outputs must be a list. - post_layer_norm: Post norm to apply to the output of the encoder. - max_possible_layers: Total layers in the fully loaded visual encoder. - - """ - if feature_sample_layers is None: - if post_layer_norm is not None: - return post_layer_norm(encoder_outputs) - return encoder_outputs - - # Get the hidden states corresponding to the layer indices. - # Negative values are relative to the full visual encoder, - # so offset them depending on how many layers were loaded. - # NOTE: this assumes that encoder_outputs contains a list - # of hidden states in the same order as the encoder layers - # that produced them. 
- offset = max_possible_layers - len(encoder_outputs) - hs_pool = [ - encoder_outputs[layer_idx] - if layer_idx >= 0 else encoder_outputs[layer_idx + offset] - for layer_idx in feature_sample_layers - ] - - # Apply post-norm on the final hidden state if we are using it - uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1) - if post_layer_norm is not None and uses_last_layer: - hs_pool[-1] = post_layer_norm(encoder_outputs) - return torch.cat(hs_pool, dim=-1) - - # Utilities for input processors _T = TypeVar("_T", str, int) From 5984499e473c387c17904aa9933b8ed080621ca6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 9 Jan 2025 01:14:14 +0800 Subject: [PATCH 21/55] [Doc] Expand Multimodal API Reference (#11852) Signed-off-by: DarkLight1337 --- docs/source/api/multimodal/index.md | 61 ++++-------------------- docs/source/api/multimodal/inputs.md | 49 +++++++++++++++++++ docs/source/api/multimodal/parse.md | 9 ++++ docs/source/api/multimodal/processing.md | 9 ++++ docs/source/api/multimodal/profiling.md | 9 ++++ docs/source/api/multimodal/registry.md | 9 ++++ vllm/multimodal/parse.py | 31 ++++++++---- vllm/multimodal/processing.py | 26 +++++++--- vllm/multimodal/profiling.py | 7 ++- 9 files changed, 139 insertions(+), 71 deletions(-) create mode 100644 docs/source/api/multimodal/inputs.md create mode 100644 docs/source/api/multimodal/parse.md create mode 100644 docs/source/api/multimodal/processing.md create mode 100644 docs/source/api/multimodal/profiling.md create mode 100644 docs/source/api/multimodal/registry.md diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index 0046b73ea825e..51e24795a34cf 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -2,10 +2,6 @@ # Multi-Modality -```{eval-rst} -.. currentmodule:: vllm.multimodal -``` - vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) @@ -13,61 +9,20 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). - ## Module Contents -```{eval-rst} -.. automodule:: vllm.multimodal -``` - -### Registry - ```{eval-rst} .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY ``` -```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalRegistry - :members: - :show-inheritance: -``` - -### Base Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.base - :members: - :show-inheritance: -``` +## Submodules -### Input Classes +```{toctree} +:maxdepth: 1 -```{eval-rst} -.. automodule:: vllm.multimodal.inputs - :members: - :show-inheritance: -``` - -### Audio Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.audio - :members: - :show-inheritance: -``` - -### Image Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.image - :members: - :show-inheritance: -``` - -### Video Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.video - :members: - :show-inheritance: +inputs +parse +processing +profiling +registry ``` diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md new file mode 100644 index 0000000000000..3d89666113229 --- /dev/null +++ b/docs/source/api/multimodal/inputs.md @@ -0,0 +1,49 @@ +# Input Definitions + +## User-facing inputs + +```{eval-rst} +.. 
autodata:: vllm.multimodal.MultiModalDataDict +``` + +## Internal data structures + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.PlaceholderRange + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autodata:: vllm.multimodal.inputs.NestedTensors +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2 + :members: + :show-inheritance: +``` diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md new file mode 100644 index 0000000000000..4676139efe626 --- /dev/null +++ b/docs/source/api/multimodal/parse.md @@ -0,0 +1,9 @@ +# Data Parsing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.parse + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md new file mode 100644 index 0000000000000..0d81c8d3966ee --- /dev/null +++ b/docs/source/api/multimodal/processing.md @@ -0,0 +1,9 @@ +# Data Processing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.processing + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md new file mode 100644 index 0000000000000..b455145212202 --- /dev/null +++ b/docs/source/api/multimodal/profiling.md @@ -0,0 +1,9 @@ +# Memory Profiling + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.profiling + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md new file mode 100644 index 0000000000000..0737a4385cf32 --- /dev/null +++ b/docs/source/api/multimodal/registry.md @@ -0,0 +1,9 @@ +# Registry + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.registry + :members: + :member-order: bysource +``` diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 6be046ba77ca7..ccff0e857eec4 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -13,14 +13,16 @@ from .audio import resample_audio from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, - ImageItem, ModalityData, MultiModalDataDict, - NestedTensors, VideoItem) + ImageItem, ModalityData, MultiModalDataDict, VideoItem) _T = TypeVar("_T") _I = TypeVar("_I") class ModalityDataItems(ABC, Generic[_T, _I]): + """ + Represents data items for a modality in :class:`MultiModalDataItems`. 
+ """ def __init__(self, data: _T, modality: str) -> None: super().__init__() @@ -69,6 +71,7 @@ def get_passthrough_data(self) -> Mapping[str, object]: class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): + """Base class for data items that are arranged in a list.""" def get_count(self) -> int: return len(self.data) @@ -83,7 +86,12 @@ def get_passthrough_data(self) -> Mapping[str, object]: return {} -class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): +class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]], + torch.Tensor]): + """ + Base class for data items that are expressed as a batched embedding tensor, + or a list of embedding tensors (one per item). + """ def get_count(self) -> int: return len(self.data) @@ -109,7 +117,7 @@ def __init__(self, data: Sequence[HfAudioItem]) -> None: class AudioEmbeddingItems(EmbeddingItems): - def __init__(self, data: NestedTensors) -> None: + def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: super().__init__(data, "audio") @@ -137,7 +145,7 @@ def get_image_size(self, item_idx: int) -> ImageSize: class ImageEmbeddingItems(EmbeddingItems): - def __init__(self, data: NestedTensors) -> None: + def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: super().__init__(data, "image") @@ -163,7 +171,7 @@ def get_frame_size(self, item_idx: int) -> ImageSize: class VideoEmbeddingItems(EmbeddingItems): - def __init__(self, data: NestedTensors) -> None: + def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: super().__init__(data, "video") @@ -172,8 +180,8 @@ def __init__(self, data: NestedTensors) -> None: class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): """ - As :class:`MultiModalDataDict`, but normalized such that each entry - corresponds to a list. + As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized + such that each entry corresponds to a list. """ def get_count(self, modality: str, *, strict: bool = True) -> int: @@ -226,7 +234,8 @@ def get_items( class MultiModalDataParser: """ - Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. + Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into + :class:`MultiModalDataItems`. Args: target_sr (float, optional): Enables automatic resampling of audio @@ -238,7 +247,9 @@ def __init__(self, *, target_sr: Optional[float] = None) -> None: self.target_sr = target_sr - def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]: + def _is_embeddings( + self, data: object + ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]: if isinstance(data, torch.Tensor): return data.ndim == 3 if is_list_of(data, torch.Tensor): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index c6a30cacebdd1..07d883d5d7295 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -33,20 +33,24 @@ @dataclass class PromptReplacement: + """ + Defines how to replace portions of an input prompt with placeholder tokens. + """ + modality: str """The modality for which the replacement is made.""" target: _PromptSeq - """The text or token sequence to find and replace.""" + """The token sequence (or text) to find and replace.""" replacement: Union[Callable[[int], _PromptSeq], _PromptSeq] = field(repr=False) """ - Given the index of the processed item within :attr:`modality`, output the - replacement text or token sequence. 
+ Given the index of the processed item within :attr:`modality`, + output the replacement token sequence (or text). - For convenience, you can pass in the replacement instead of a function - if it does not depend on the input. + For convenience, you can directly pass in the replacement token sequence + (or text) instead of a function if it does not depend on the input. """ def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement": @@ -132,6 +136,11 @@ def token_ids(self) -> list[int]: @dataclass class BoundPromptReplacement: + """ + A :class:`PromptReplacement` bound to a tokenizer to automatically + convert :attr:`target` and the result of :meth:`get_replacement` between + token sequence and text representations. + """ tokenizer: AnyTokenizer = field(repr=False) modality: str @@ -144,6 +153,7 @@ def __post_init__(self) -> None: @property def target(self) -> _BoundPromptSequence: + """The token sequence (or text) to find and replace.""" target = self._target return _BoundPromptSequence( @@ -153,6 +163,10 @@ def target(self) -> _BoundPromptSequence: ) def get_replacement(self, item_idx: int) -> _BoundPromptSequence: + """ + Given the index of the processed item within :attr:`modality`, + output the replacement token sequence (or text). + """ replacement = self._replacement if callable(replacement): cache_key = item_idx @@ -528,7 +542,7 @@ def put( class BaseProcessingInfo: - """Base class containing information to perform processing.""" + """Base class to provide the information necessary for data processing.""" def __init__(self, ctx: InputProcessingContext) -> None: super().__init__() diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 2ac3a6bcf3ddd..6f7da1509990f 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -19,7 +19,10 @@ @dataclass class ProcessorInputs: - """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" + """ + Represents the keyword arguments to + :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. + """ prompt_text: str mm_data: MultiModalDataDict hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) @@ -47,7 +50,7 @@ def get_dummy_processor_inputs( ) -> ProcessorInputs: """ Build the input which, after processing, results in - `self.info.get_mm_max_tokens_per_item()` placeholder tokens. + :code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens. """ raise NotImplementedError From 47de8821d3cdd32fce7df6312318223aee591fd2 Mon Sep 17 00:00:00 2001 From: WangErXiao <863579016@qq.com> Date: Thu, 9 Jan 2025 02:21:30 +0800 Subject: [PATCH 22/55] [Misc]add some explanations for BlockHashType (#11847) --- vllm/v1/core/kv_cache_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 84ff48bf428a0..22a5d2fb08a48 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -11,8 +11,10 @@ class BlockHashType(NamedTuple): """Hash value of a block (int), the token IDs in the block, and extra keys. - The reason we keep a tuple of token IDs and extra keys is to make sure - no hash collision happens when the hash value is the same. + We keep a tuple of token IDs and extra keys to reduce the likelihood of + hash collisions when the hash value is the same. But please note that + hash collisions can still theoretically occur, albeit with an extremely + low probability. """ # Hash value of the block in an integer. 
hash_value: int From 56fe4c297c7d9d872eccc19e3edbf1d75e1a30e2 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:33:29 -0500 Subject: [PATCH 23/55] [TPU][Quantization] TPU `W8A8` (#11785) Co-authored-by: Woosuk Kwon --- .buildkite/run-tpu-test.sh | 11 +- tests/tpu/test_quantization_accuracy.py | 49 +++++++ .../schemes/compressed_tensors_w8a8_int8.py | 105 ++++---------- .../schemes/compressed_tensors_wNa16.py | 2 +- .../layers/quantization/gptq_marlin.py | 2 +- .../layers/quantization/kernels/__init__.py | 74 ---------- .../{ => mixed_precision}/MPLinearKernel.py | 0 .../kernels/mixed_precision/__init__.py | 74 ++++++++++ .../kernels/{ => mixed_precision}/exllama.py | 0 .../kernels/{ => mixed_precision}/machete.py | 0 .../kernels/{ => mixed_precision}/marlin.py | 0 .../kernels/scaled_mm/ScaledMMLinearKernel.py | 64 +++++++++ .../kernels/scaled_mm/__init__.py | 84 +++++++++++ .../quantization/kernels/scaled_mm/cutlass.py | 134 ++++++++++++++++++ .../quantization/kernels/scaled_mm/xla.py | 101 +++++++++++++ .../layers/quantization/utils/w8a8_utils.py | 38 ----- vllm/model_executor/parameter.py | 13 ++ vllm/platforms/tpu.py | 4 +- 18 files changed, 565 insertions(+), 190 deletions(-) create mode 100644 tests/tpu/test_quantization_accuracy.py rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/MPLinearKernel.py (100%) create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/exllama.py (100%) rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/machete.py (100%) rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/marlin.py (100%) create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 13605a3e97142..a8f021890f742 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -14,4 +14,13 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. 
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ + && python3 -m pip install pytest \ + && python3 -m pip install lm_eval[api]==0.4.4 \ + && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \ + && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ + && python3 /workspace/vllm/tests/tpu/test_compilation.py \ + && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ + && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py" diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py new file mode 100644 index 0000000000000..6cd5615c44e1e --- /dev/null +++ b/tests/tpu/test_quantization_accuracy.py @@ -0,0 +1,49 @@ +from dataclasses import dataclass + +import lm_eval +import pytest + +TASK = "gsm8k" +FILTER = "exact_match,strict-match" +RTOL = 0.03 + + +@dataclass +class GSM8KAccuracyTestConfig: + model_name: str + excepted_value: float + + def get_model_args(self) -> str: + return (f"pretrained={self.model_name}," + "max_model_len=4096,max_num_seqs=32") + + +# NOTE: Accuracy scores measured on GPUs. +ACCURACY_CONFIGS = [ + GSM8KAccuracyTestConfig( + model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + excepted_value=0.76), # no bias + # NOTE(rob): We cannot re-initialize VLLM in the same process for TPU, + # so only one of these tests can run in a single call to pytest. As + # a follow up, move this into the LM-EVAL section of the CI. 
+ # GSM8KAccuracyTestConfig( + # model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8", + # excepted_value=0.66), # bias in QKV layers +] + + +@pytest.mark.parametrize("config", ACCURACY_CONFIGS) +def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig): + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=config.get_model_args(), + tasks="gsm8k", + batch_size="auto", + ) + + EXPECTED_VALUE = config.excepted_value + measured_value = results["results"][TASK][FILTER] + assert (measured_value - RTOL < EXPECTED_VALUE + and measured_value + RTOL > EXPECTED_VALUE + ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 6cbc58d61e970..0e3f4731775c5 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -1,14 +1,13 @@ -from typing import Callable, List, Optional +from typing import Callable, List, Optional, Set import torch from compressed_tensors.quantization import QuantizationStrategy -from torch.nn import Parameter from vllm.logger import init_logger from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_int8_linear, convert_to_channelwise) +from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( + ScaledMMLinearLayerConfig, choose_scaled_mm_linear_kernel) from vllm.model_executor.parameter import (BasevLLMParameter, ChannelQuantScaleParameter, ModelWeightParameter, @@ -18,6 +17,7 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme): + _kernel_backends_being_used: Set[str] = set() def __init__(self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool): @@ -30,74 +30,25 @@ def get_min_capability(cls) -> int: # turing and up return 75 - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # WEIGHT - # Cutlass kernels need transposed weight. - weight = layer.weight - layer.weight = Parameter(weight.t(), requires_grad=False) - - # WEIGHT SCALE - # Cutlass kernels support only per-tensor and per-channel. - # If we have a fused module (QKV, MLP) with per tensor scales (thus N - # scales being passed to the kernel), convert to the per-channel case. 
- is_fused_module = len(self.logical_widths) > 1 - if is_fused_module and self.strategy == QuantizationStrategy.TENSOR: - ws_channelwise = convert_to_channelwise(layer.weight_scale, - self.logical_widths) - layer.weight_scale = Parameter(ws_channelwise, requires_grad=False) - else: - layer.weight_scale = Parameter(layer.weight_scale.data, - requires_grad=False) - # INPUT SCALE - if self.is_static_input_scheme: - if self.input_symmetric: - layer.input_scale = Parameter(layer.input_scale.max(), - requires_grad=False) - layer.input_zero_point = None - else: - # reconstruct the ranges - int8_traits = torch.iinfo(torch.int8) - azps = layer.input_zero_point.to(dtype=torch.int32) - range_max = (layer.input_scale * - (int8_traits.max - azps)).max() - range_min = (layer.input_scale * - (int8_traits.min - azps)).min() - - scale = (range_max - range_min) / (int8_traits.max - - int8_traits.min) - layer.input_scale = Parameter(scale, requires_grad=False) - - # AZP loaded as int8 but used as int32 - azp = (int8_traits.min - - range_min / scale).to(dtype=torch.int32) - layer.input_zero_point = Parameter(azp, requires_grad=False) - - else: - layer.input_scale = None - layer.input_zero_point = None - - # azp_adj is the AZP adjustment term, used to account for weights. - # It does not depend on scales or azp, so it is the same for - # static and dynamic quantization. - # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md - # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md - if not self.input_symmetric: - azp_adj = layer.weight.sum(dim=0, keepdim=True, dtype=torch.int32) - if self.is_static_input_scheme: - # cutlass_w8a8 requires azp to be folded into azp_adj - # in the per-tensor case - azp_adj = layer.input_zero_point * azp_adj - - layer.azp_adj = azp_adj - else: - layer.azp_adj = None - def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - self.logical_widths = output_partition_sizes + layer.logical_widths = output_partition_sizes + + scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig( + is_channelwise=(self.strategy == QuantizationStrategy.CHANNEL), + is_static_input_scheme=self.is_static_input_scheme, + input_symmetric=self.input_symmetric) + + kernel_type = choose_scaled_mm_linear_kernel( + scaled_mm_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for CompressedTensorsW8A8Int8", + kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) # WEIGHT weight = ModelWeightParameter(data=torch.empty( @@ -140,12 +91,18 @@ def create_weights(self, layer: torch.nn.Module, weight_loader=weight_loader) layer.register_parameter("input_zero_point", input_zero_point) + self.kernel = kernel_type(c=scaled_mm_linear_kernel_config, + w_q_param_name="weight", + w_s_param_name="weight_scale", + i_s_param_name="input_scale", + i_zp_param_name="input_zero_point", + azp_adj_param_name="azp_adj") + + # Checkpoints are serialized in compressed-tensors format, which is + # different from the format the kernel may want. Handle repacking here. 
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: - return apply_int8_linear(input=x, - weight=layer.weight, - weight_scale=layer.weight_scale, - input_scale=layer.input_scale, - input_zero_point=layer.input_zero_point, - azp_adj=layer.azp_adj, - bias=bias) + return self.kernel.apply_weights(layer, x, bias) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index a515738017781..2dd243b9c3109 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -6,7 +6,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) -from vllm.model_executor.layers.quantization.kernels import ( +from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( MPLinearLayerConfig, choose_mp_linear_kernel) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_repeat_scales_on_all_ranks) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index a006d729cc627..2dbfca9b07690 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -11,7 +11,7 @@ set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.quantization.kernels import ( +from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( MPLinearLayerConfig, choose_mp_linear_kernel) from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.marlin_utils import ( diff --git a/vllm/model_executor/layers/quantization/kernels/__init__.py b/vllm/model_executor/layers/quantization/kernels/__init__.py index 94a3dc2584d6b..e69de29bb2d1d 100644 --- a/vllm/model_executor/layers/quantization/kernels/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/__init__.py @@ -1,74 +0,0 @@ -from typing import List, Optional, Type - -import vllm.envs as envs -from vllm.model_executor.layers.quantization.kernels.exllama import ( - ExllamaLinearKernel) -from vllm.model_executor.layers.quantization.kernels.machete import ( - MacheteLinearKernel) -from vllm.model_executor.layers.quantization.kernels.marlin import ( - MarlinLinearKernel) -from vllm.model_executor.layers.quantization.kernels.MPLinearKernel import ( - MPLinearKernel, MPLinearLayerConfig) -from vllm.platforms import current_platform - -# in priority/performance order (when available) -_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [ - MacheteLinearKernel, - MarlinLinearKernel, - ExllamaLinearKernel, -] - - -def choose_mp_linear_kernel( - config: MPLinearLayerConfig, - compute_capability: Optional[int] = None) -> Type[MPLinearKernel]: - """ - Choose an MPLinearKernel that can implement the given config for the given - compute capability. Attempts to choose the best kernel in terms of - performance. 
- - Args: - config (MPLinearLayerConfig): Description of the linear layer to be - implemented. - compute_capability (Optional[int], optional): The compute capability of - the target device, if None uses `current_platform` to get the compute - capability. Defaults to None. - - Raises: - ValueError: If no kernel can implement the given config. - - Returns: - Type[MPLinearKernel]: Chosen kernel. - """ - if compute_capability is None: - if current_platform is None: - raise ValueError("Cannot determine compute capability") - _cc = current_platform.get_device_capability() - compute_capability = _cc[0] * 10 + _cc[1] - - failure_reasons = [] - for kernel in _POSSIBLE_KERNELS: - if kernel.__name__ in envs.VLLM_DISABLED_KERNELS: - failure_reasons.append( - f' {kernel.__name__} disabled by environment variable') - continue - - if kernel.get_min_capability() > compute_capability: - failure_reasons.append( - f"{kernel.__name__} requires capability " - f"{kernel.get_min_capability()}, current compute capability " - f"is {compute_capability}") - continue - - can_implement, failure_reason = kernel.can_implement(config) - if can_implement: - return kernel - else: - failure_reasons.append( - f' {kernel.__name__} cannot implement due to: {failure_reason}' - ) - - raise ValueError( - "Failed to find a kernel that can implement the "\ - "WNA16 linear layer. Reasons: \n" - + '\n'.join(failure_reasons)) diff --git a/vllm/model_executor/layers/quantization/kernels/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py similarity index 100% rename from vllm/model_executor/layers/quantization/kernels/MPLinearKernel.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py new file mode 100644 index 0000000000000..83549870e3f0b --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -0,0 +1,74 @@ +from typing import List, Optional, Type + +import vllm.envs as envs +from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import ( # noqa: E501 + ExllamaLinearKernel) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.machete import ( # noqa: E501 + MacheteLinearKernel) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.marlin import ( # noqa: E501 + MarlinLinearKernel) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKernel import ( # noqa: E501 + MPLinearKernel, MPLinearLayerConfig) +from vllm.platforms import current_platform + +# in priority/performance order (when available) +_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [ + MacheteLinearKernel, + MarlinLinearKernel, + ExllamaLinearKernel, +] + + +def choose_mp_linear_kernel( + config: MPLinearLayerConfig, + compute_capability: Optional[int] = None) -> Type[MPLinearKernel]: + """ + Choose an MPLinearKernel that can implement the given config for the given + compute capability. Attempts to choose the best kernel in terms of + performance. + + Args: + config (MPLinearLayerConfig): Description of the linear layer to be + implemented. + compute_capability (Optional[int], optional): The compute capability of + the target device, if None uses `current_platform` to get the compute + capability. Defaults to None. + + Raises: + ValueError: If no kernel can implement the given config. 
+ + Returns: + Type[MPLinearKernel]: Chosen kernel. + """ + if compute_capability is None: + if current_platform is None: + raise ValueError("Cannot determine compute capability") + _cc = current_platform.get_device_capability() + compute_capability = _cc[0] * 10 + _cc[1] + + failure_reasons = [] + for kernel in _POSSIBLE_KERNELS: + if kernel.__name__ in envs.VLLM_DISABLED_KERNELS: + failure_reasons.append( + f' {kernel.__name__} disabled by environment variable') + continue + + if kernel.get_min_capability() > compute_capability: + failure_reasons.append( + f"{kernel.__name__} requires capability " + f"{kernel.get_min_capability()}, current compute capability " + f"is {compute_capability}") + continue + + can_implement, failure_reason = kernel.can_implement(config) + if can_implement: + return kernel + else: + failure_reasons.append( + f' {kernel.__name__} cannot implement due to: {failure_reason}' + ) + + raise ValueError( + "Failed to find a kernel that can implement the "\ + "WNA16 linear layer. Reasons: \n" + + '\n'.join(failure_reasons)) diff --git a/vllm/model_executor/layers/quantization/kernels/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py similarity index 100% rename from vllm/model_executor/layers/quantization/kernels/exllama.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py diff --git a/vllm/model_executor/layers/quantization/kernels/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py similarity index 100% rename from vllm/model_executor/layers/quantization/kernels/machete.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py diff --git a/vllm/model_executor/layers/quantization/kernels/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py similarity index 100% rename from vllm/model_executor/layers/quantization/kernels/marlin.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py new file mode 100644 index 0000000000000..75cf91f191136 --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -0,0 +1,64 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch + + +@dataclass +class ScaledMMLinearLayerConfig: + is_channelwise: bool + is_static_input_scheme: bool + input_symmetric: bool + + +class ScaledMMLinearKernel(ABC): + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + raise NotImplementedError + + @classmethod + @abstractmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + raise NotImplementedError + + def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str, + w_s_param_name: str, i_s_param_name: str, + i_zp_param_name: str, azp_adj_param_name: str) -> None: + assert self.can_implement(c) + self.config = c + self.w_q_name = w_q_param_name + self.w_s_name = w_s_param_name + self.i_s_name = i_s_param_name + self.i_zp_name = i_zp_param_name + self.azp_adj_name = azp_adj_param_name + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + raise NotImplementedError + + @abstractmethod + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + 
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        raise NotImplementedError
+
+    def _get_weight_params(
+            self, layer: torch.nn.Module
+    ) -> Tuple[torch.Tensor,  # weight
+               torch.Tensor,  # weight_scale
+               Optional[torch.Tensor],  # input_scale,
+               Optional[torch.Tensor],  # input_zp
+               Optional[torch.Tensor],  # azp_adj
+               ]:
+        return (
+            getattr(layer, self.w_q_name),
+            getattr(layer, self.w_s_name),
+            getattr(layer, self.i_s_name),
+            getattr(layer, self.i_zp_name),
+            getattr(layer, self.azp_adj_name),
+        )
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
new file mode 100644
index 0000000000000..586752d3d34e3
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -0,0 +1,84 @@
+import os
+from typing import Dict, List, Optional, Type
+
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
+    CutlassScaledMMLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
+    ScaledMMLinearKernel, ScaledMMLinearLayerConfig)
+# from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
+#     TritonScaledMMLinear)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
+    XLAScaledMMLinearKernel)
+from vllm.platforms import PlatformEnum, current_platform
+
+# in priority/performance order (when available)
+_POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
+    PlatformEnum.CPU: [CutlassScaledMMLinearKernel],
+    PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
+    # TODO(rob): Create TritonScaledMMLinear kernel. ROCM will
+    # incorrectly attempt to run AZP models if prompted to.
+    PlatformEnum.ROCM: [CutlassScaledMMLinearKernel],
+    PlatformEnum.TPU: [XLAScaledMMLinearKernel],
+}
+
+
+def choose_scaled_mm_linear_kernel(
+        config: ScaledMMLinearLayerConfig,
+        compute_capability: Optional[int] = None
+) -> Type[ScaledMMLinearKernel]:
+    """
+    Choose a ScaledMMLinearKernel that can implement the given config for the
+    given compute capability. Attempts to choose the best kernel in terms of
+    performance.
+
+    Args:
+        config (ScaledMMLinearLayerConfig): Description of the linear layer
+            to be implemented.
+        compute_capability (Optional[int], optional): The compute capability of
+            the target device, if None uses `current_platform` to get the
+            compute capability. Defaults to None.
+
+    Raises:
+        ValueError: If no kernel can implement the given config.
+
+    Returns:
+        Type[ScaledMMLinearKernel]: Chosen kernel.
+    """
+
+    if compute_capability is None:
+        _cc = current_platform.get_device_capability()
+        if _cc is not None:
+            compute_capability = _cc[0] * 10 + _cc[1]
+
+    failure_reasons = []
+    for kernel in _POSSIBLE_KERNELS[current_platform._enum]:
+        if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "")\
+            .split(","):
+            failure_reasons.append(
+                f' {kernel.__name__} disabled by environment variable')
+            continue
+
+        # If the current platform uses compute_capability,
+        # make sure the kernel supports the compute capability.
+ if compute_capability is not None: + kernel_min_capability = kernel.get_min_capability() + if (kernel_min_capability is not None + and kernel_min_capability > compute_capability): + failure_reasons.append( + f"{kernel.__name__} requires capability " + f"{kernel_min_capability}, current compute capability " + f"is {compute_capability}") + continue + + can_implement, failure_reason = kernel.can_implement(config) + if can_implement: + return kernel + else: + failure_reasons.append( + f' {kernel.__name__} cannot implement due to: {failure_reason}' + ) + + raise ValueError( + "Failed to find a kernel that can implement the "\ + "ScaledMM linear layer. Reasons: \n" + + '\n'.join(failure_reasons)) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py new file mode 100644 index 0000000000000..2e83a04286a0d --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -0,0 +1,134 @@ +from typing import Optional, Tuple + +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise) +from vllm.platforms import current_platform + +from .ScaledMMLinearKernel import (ScaledMMLinearKernel, + ScaledMMLinearLayerConfig) + + +class CutlassScaledMMLinearKernel(ScaledMMLinearKernel): + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + + if (not current_platform.is_cuda() and not current_platform.is_cpu()): + return False, "CutlassScaledMM requires running on CUDA or CPU." + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # WEIGHT + # Cutlass kernels need transposed weight. + weight = getattr(layer, self.w_q_name) + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(weight.t().data, requires_grad=False)) + + # WEIGHT SCALE + # Cutlass kernels support only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. 
+ is_fused_module = len(layer.logical_widths) > 1 + weight_scale = getattr(layer, self.w_s_name) + if is_fused_module and not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, + layer.logical_widths) + replace_parameter( + layer, self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False)) + + # INPUT SCALE + if self.config.is_static_input_scheme: + input_scale = getattr(layer, self.i_s_name) + + if self.config.input_symmetric: + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(input_scale.max(), requires_grad=False)) + setattr(layer, self.i_zp_name, None) + else: + input_zero_point = getattr(layer, self.i_zp_name) + + # reconstruct the ranges + int8_traits = torch.iinfo(torch.int8) + azps = input_zero_point.to(dtype=torch.int32) + range_max = (input_scale * (int8_traits.max - azps)).max() + range_min = (input_scale * (int8_traits.min - azps)).min() + + scale = (range_max - range_min) / (int8_traits.max - + int8_traits.min) + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(scale, requires_grad=False)) + + # AZP loaded as int8 but used as int32 + azp = (int8_traits.min - + range_min / scale).to(dtype=torch.int32) + replace_parameter(layer, self.i_zp_name, + torch.nn.Parameter(azp, requires_grad=False)) + + else: + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + + # azp_adj is the AZP adjustment term, used to account for weights. + # It does not depend on scales or azp, so it is the same for + # static and dynamic quantization. + # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md + # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md + if not self.config.input_symmetric: + weight = getattr(layer, self.w_q_name) + azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32) + if self.config.is_static_input_scheme: + # cutlass_w8a8 requires azp to be folded into azp_adj + # in the per-tensor case + azp_adj = getattr(layer, self.i_zp_name) * azp_adj + setattr(layer, self.azp_adj_name, + torch.nn.Parameter(azp_adj, requires_grad=False)) + else: + setattr(layer, self.azp_adj_name, None) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + # ops.scaled_int8_quant supports both dynamic and static quant: + # * dynamic, i_s is None and x_s computed from x. + # * static, i_s is scalar and x_s is i_s. 
+        symmetric = azp_adj is None
+        x_q, x_s, x_zp = ops.scaled_int8_quant(x,
+                                               i_s,
+                                               i_zp,
+                                               symmetric=symmetric)
+
+        if x_zp is not None:
+            # Currently, static is always per-tensor and dynamic is per-token
+            static = i_zp is not None
+            azp = None if static else x_zp
+            return ops.cutlass_scaled_mm_azp(x_q,
+                                             w_q,
+                                             scale_a=x_s,
+                                             scale_b=w_s,
+                                             out_dtype=x.dtype,
+                                             azp_adj=azp_adj,
+                                             azp=azp,
+                                             bias=bias)
+        return ops.cutlass_scaled_mm(x_q,
+                                     w_q,
+                                     scale_a=x_s,
+                                     scale_b=w_s,
+                                     out_dtype=x.dtype,
+                                     bias=bias)
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
new file mode 100644
index 0000000000000..9de668e658826
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -0,0 +1,101 @@
+import warnings
+from typing import Optional, Tuple
+
+import torch
+from functorch.experimental.control_flow import cond  # noqa: F401
+
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise)
+from vllm.platforms import current_platform
+
+from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
+                                   ScaledMMLinearLayerConfig)
+
+
+class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError(
+            "TPU platform does not have a concept of compute capability, "
+            "this method should not be called.")
+
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+
+        if not current_platform.is_tpu():
+            return False, "ScaledMMXLA requires running on TPU."
+
+        if c.is_static_input_scheme:
+            return False, "ScaledMMXLA requires dynamic activation scales."
+
+        if not c.input_symmetric:
+            return False, "ScaledMMXLA requires symmetric activation scales."
+
+        if not c.is_channelwise:
+            return False, "ScaledMMXLA requires channelwise weight scales"
+
+        return True, None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # WEIGHT
+        # [out, in] (different than cutlass_scaled_mm)
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(layer, self.w_q_name,
+                          torch.nn.Parameter(weight.data, requires_grad=False))
+
+        # WEIGHT SCALE
+        # XLA kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+
+        # [out_channel,] (different than cutlass_scaled_mm)
+        weight_scale = weight_scale.squeeze(-1)
+        replace_parameter(
+            layer, self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+
+        # Only support symmetric dynamic activation quantization.
+        setattr(layer, self.i_s_name, None)
+        setattr(layer, self.i_zp_name, None)
+        setattr(layer, self.azp_adj_name, None)
+
+        # Filter warning for cond usage in apply_weights. It is okay
+        # to specialize the graph since bias is not dynamic.
+        warnings.filterwarnings(
+            "ignore",
+            message=
+            "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches."
# noqa: E501 + ) + + def no_add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]): + return x + + def add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]): + return x + bias + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, _, _, _ = self._get_weight_params(layer) + + import torch_xla.experimental.xla_quantized_matmul # noqa: F401 + out = torch.ops.xla.quantized_matmul(x, + w_q, + w_s, + zero_point=None, + block_size=-1, + int4_weight=False, + quantize_activation=True) + + # Explicitly capture control flow to make dynamo happy. + # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501 + return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias]) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index d89071f30a549..7cdce67cf1677 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -201,44 +201,6 @@ def apply_fp8_linear( return output.to(dtype=input.dtype).view(*output_shape) -def apply_int8_linear( - input: torch.Tensor, - weight: torch.Tensor, - weight_scale: torch.Tensor, - input_scale: Optional[torch.Tensor] = None, - input_zero_point: Optional[torch.Tensor] = None, - azp_adj: Optional[torch.Tensor] = None, - bias: Optional[torch.Tensor] = None, -): - # ops.scaled_int8_quant supports both dynamic and static quant. - # * dynamic, layer.input_scale is None and x_scale computed from x. - # * static, layer.input_scale is scalar and x_scale is input_scale. - symmetric = azp_adj is None - x_q, x_scale, x_zp = ops.scaled_int8_quant(input, - input_scale, - input_zero_point, - symmetric=symmetric) - - if x_zp is not None: - # Currently, static is always per-tensor and dynamic is per-token - static = input_zero_point is not None - azp = None if static else x_zp - return ops.cutlass_scaled_mm_azp(x_q, - weight, - scale_a=x_scale, - scale_b=weight_scale, - out_dtype=input.dtype, - azp_adj=azp_adj, - azp=azp, - bias=bias) - return ops.cutlass_scaled_mm(x_q, - weight, - scale_a=x_scale, - scale_b=weight_scale, - out_dtype=input.dtype, - bias=bias) - - def normalize_e4m3fn_to_e4m3fnuz( weight: torch.Tensor, weight_scale: torch.Tensor, diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 02d22a5ca62c0..fc5a3e7fba674 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -6,6 +6,7 @@ from vllm.distributed import get_tensor_model_parallel_rank from vllm.logger import init_logger +from vllm.model_executor.utils import _make_synced_weight_loader __all__ = [ "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter", @@ -37,6 +38,18 @@ def __init__(self, data: torch.Tensor, weight_loader: Callable): :returns: a torch.nn.parameter """ + # During weight loading, we often do something like: + # narrowed_tensor = param.data.narrow(0, offset, len) + # narrowed_tensor.copy_(real_weight) + # expecting narrowed_tensor and param.data to share the same storage. + # However, on TPUs, narrowed_tensor will lazily propagate to the base + # tensor, which is param.data, leading to the redundant memory usage. + # This sometimes causes OOM errors during model loading. To avoid this, + # we sync the param tensor after its weight loader is called. 
+ from vllm.platforms import current_platform + if current_platform.is_tpu(): + weight_loader = _make_synced_weight_loader(weight_loader) + self._weight_loader = weight_loader @property diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 77f5c8401424b..d488daf056f1a 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -19,7 +19,9 @@ class TpuPlatform(Platform): device_name: str = "tpu" device_type: str = "tpu" dispatch_key: str = "XLA" - supported_quantization: list[str] = ["tpu_int8"] + supported_quantization: list[str] = [ + "tpu_int8", "compressed-tensors", "compressed_tensors" + ] @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: From 526de822d501c792b051c864ba873a836d78d5bf Mon Sep 17 00:00:00 2001 From: rasmith Date: Wed, 8 Jan 2025 14:23:15 -0600 Subject: [PATCH 24/55] [Kernel][Triton][AMD] Use block size heuristic for avg 2.8x speedup for int8 models (#11698) Signed-off-by: Randall Smith --- .../compressed_tensors/triton_scaled_mm.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index 3ff162170f255..2659afcdc74a9 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -128,7 +128,8 @@ def triton_scaled_mm(input: torch.Tensor, bias: Optional[torch.Tensor] = None, block_size_m: int = 32, block_size_n: int = 32, - block_size_k: int = 32) -> torch.Tensor: + block_size_k: int = 32, + use_heuristic=True) -> torch.Tensor: M, K = input.shape N = weight.shape[1] @@ -152,6 +153,20 @@ def triton_scaled_mm(input: torch.Tensor, has_scalar = lambda x: x.shape[0] == 1 and x.shape[1] == 1 + if use_heuristic: + is_small_N = N < 8192 + next_power_of_2_M = max(32, triton.next_power_of_2(M)) + if next_power_of_2_M <= 32: + tile_shape = (64, 64, 256) if is_small_N else (64, 128, 256) + elif next_power_of_2_M <= 64: + tile_shape = (64, 64, 256) + elif next_power_of_2_M <= 128: + tile_shape = (64, 128, 128) + else: + tile_shape = (128, 128, 128) + + block_size_m, block_size_n, block_size_k = tile_shape + block_size_sa = 1 if has_scalar(scale_a) else block_size_m block_size_sb = 1 if has_scalar(scale_b) else block_size_n From 3db0cafdf1fe7f4cd7e41a145f78e8a568b4d63c Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 8 Jan 2025 12:38:28 -0800 Subject: [PATCH 25/55] [Docs] Add Google Cloud Meetup (#11864) --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 1f82229f39537..253a0bb913e37 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,10 @@ Easy, fast, and cheap LLM serving for everyone --- +The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui) + +--- + *Latest News* 🔥 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! 
Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). From 615e4a54017136649db275b68932af80168781f8 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 8 Jan 2025 21:20:44 -0500 Subject: [PATCH 26/55] [CI] Turn on basic correctness tests for V1 (#10864) --- tests/basic_correctness/test_basic_correctness.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 1c2193bb17a55..31a101e48e026 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -44,7 +44,6 @@ def test_vllm_gc_ed(): assert weak_llm() is None -@pytest.mark.skip_v1 @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) @pytest.mark.parametrize("dtype", ["half"]) From 1fe554bac32419a6d64a5c977849806a1efd9725 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Thu, 9 Jan 2025 00:05:43 -0300 Subject: [PATCH 27/55] treat do_lower_case in the same way as the sentence-transformers library (#11815) Signed-off-by: Max de Bayser --- tests/entrypoints/openai/test_serving_chat.py | 1 + tests/models/embedding/language/test_embedding.py | 1 + vllm/entrypoints/openai/serving_engine.py | 5 +++++ vllm/inputs/preprocess.py | 6 ++++++ vllm/transformers_utils/tokenizer_group/__init__.py | 5 ----- 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 97248f1150979..f431d1065e0eb 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -35,6 +35,7 @@ class MockModelConfig: logits_processor_pattern = None diff_sampling_param: Optional[dict] = None allowed_local_media_path: str = "" + encoder_config = None def get_diff_sampling_param(self): return self.diff_sampling_param or {} diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index f458ef5ef556d..7749806548cd9 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -15,6 +15,7 @@ # [Encoder-only] pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-large"), # [Encoder-decoder] pytest.param("intfloat/e5-mistral-7b-instruct", diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 319f869240036..88859255f202a 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -160,6 +160,11 @@ def _normalize_prompt_text_to_input( truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]], add_special_tokens: bool, ) -> TextTokensPrompt: + if (self.model_config.encoder_config is not None + and self.model_config.encoder_config.get( + "do_lower_case", False)): + prompt = prompt.lower() + if truncate_prompt_tokens is None: encoded = tokenizer(prompt, add_special_tokens=add_special_tokens) else: diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 6ddc1eb76f10d..3e92d5821e645 100644 --- 
a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -190,6 +190,12 @@ def _tokenize_prompt( # on the task and language of their request. Also needed to avoid # appending an EOS token to the prompt which disrupts generation. add_special_tokens = False + + if (self.model_config.encoder_config is not None + and self.model_config.encoder_config.get( + "do_lower_case", False)): + prompt = prompt.lower() + return tokenizer.encode(request_id=request_id, prompt=prompt, lora_request=lora_request, diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index c0b3d2585a962..d400276796996 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -26,11 +26,6 @@ def init_tokenizer_from_configs(model_config: ModelConfig, trust_remote_code=model_config.trust_remote_code, revision=model_config.tokenizer_revision) - if (model_config.encoder_config is not None - and "do_lower_case" in model_config.encoder_config): - init_kwargs["do_lower_case"] = model_config.encoder_config[ - "do_lower_case"] - return get_tokenizer_group(parallel_config.tokenizer_pool_config, **init_kwargs) From 730e9592e97c643474aa44e9d3dbe6f55c4b9ad9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 8 Jan 2025 22:37:48 -0500 Subject: [PATCH 28/55] [Doc] Recommend uv and python 3.12 for quickstart guide (#11849) Signed-off-by: mgoin --- docs/source/getting_started/quickstart.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 2808e1b386801..ea15d9ef065fa 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -15,10 +15,19 @@ This guide will help you quickly get started with vLLM to perform: ## Installation If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly. -It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. + +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: + +```console +$ uv venv myenv --python 3.12 --seed +$ source myenv/bin/activate +$ uv pip install vllm +``` + +You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. 
```console -$ conda create -n myenv python=3.10 -y +$ conda create -n myenv python=3.12 -y $ conda activate myenv $ pip install vllm ``` From d848800e884f581eeed9f154d6c2aeb38eac24de Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 9 Jan 2025 12:48:12 +0800 Subject: [PATCH 29/55] [Misc] Move `print_*_once` from utils to logger (#11298) Signed-off-by: DarkLight1337 Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Co-authored-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 1 + vllm/attention/backends/torch_sdpa.py | 9 ++- vllm/attention/backends/xformers.py | 8 ++- vllm/config.py | 9 ++- vllm/entrypoints/chat_utils.py | 7 +-- vllm/inputs/preprocess.py | 20 ++++--- vllm/inputs/registry.py | 4 +- vllm/logger.py | 57 +++++++++++++++++-- vllm/lora/peft_helper.py | 6 +- vllm/lora/punica_wrapper/punica_selector.py | 8 ++- vllm/model_executor/custom_op.py | 3 +- .../compressed_tensors_moe.py | 8 ++- .../model_executor/layers/quantization/fp8.py | 5 +- .../layers/quantization/kv_cache.py | 6 +- .../quantization/utils/marlin_utils_fp8.py | 6 +- .../model_loader/weight_utils.py | 8 +-- vllm/model_executor/models/chameleon.py | 6 +- vllm/model_executor/models/olmoe.py | 6 +- vllm/model_executor/models/qwen2_moe.py | 6 +- vllm/model_executor/models/vision.py | 6 +- vllm/utils.py | 12 ---- 21 files changed, 129 insertions(+), 72 deletions(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index ee768db63c96c..556b60d2fca12 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,6 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin + sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index c14f7754596dd..ca1c4618615de 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -13,9 +13,12 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.ipex_attn import PagedAttention from vllm.attention.ops.paged_attn import PagedAttentionMetadata -from vllm.utils import make_tensor_with_pad, print_warning_once +from vllm.logger import init_logger +from vllm.utils import make_tensor_with_pad from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder +logger = init_logger(__name__) + class TorchSDPABackend(AttentionBackend): @@ -396,8 +399,8 @@ def __init__( raise ValueError( "Torch SPDA does 
not support block-sparse attention.") if logits_soft_cap is not None: - print_warning_once("Torch SPDA does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 694c7cc1bc36a..8c8ca8520a9db 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -17,7 +17,9 @@ is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set) from vllm.attention.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) -from vllm.utils import print_warning_once +from vllm.logger import init_logger + +logger = init_logger(__name__) class XFormersBackend(AttentionBackend): @@ -385,8 +387,8 @@ def __init__( raise ValueError( "XFormers does not support block-sparse attention.") if logits_soft_cap is not None: - print_warning_once("XFormers does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("XFormers does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/config.py b/vllm/config.py index 6dabeb3861af2..19609085cc960 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -32,8 +32,7 @@ from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, - get_cpu_memory, print_warning_once, random_uuid, - resolve_obj_by_qualname) + get_cpu_memory, random_uuid, resolve_obj_by_qualname) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -314,7 +313,7 @@ def __init__(self, sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) - print_warning_once( + logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " "XFORMERS backend. Disabling sliding window and capping " @@ -2758,7 +2757,7 @@ def uuid(self): def model_post_init(self, __context: Any) -> None: if not self.enable_reshape and self.enable_fusion: - print_warning_once( + logger.warning_once( "Fusion enabled but reshape elimination disabled." "RMSNorm + quant (fp8) fusion might not work") @@ -3151,7 +3150,7 @@ def __post_init__(self): self.scheduler_config.chunked_prefill_enabled and \ self.model_config.dtype == torch.float32 and \ current_platform.get_device_capability() == (7, 5): - print_warning_once( + logger.warning_once( "Turing devices tensor cores do not support float32 matmul. 
" "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index a492d5496e025..923c7459f6948 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -35,7 +35,6 @@ from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import MediaConnector from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -985,14 +984,14 @@ def apply_mistral_chat_template( **kwargs: Any, ) -> List[int]: if chat_template is not None: - print_warning_once( + logger.warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") if "add_generation_prompt" in kwargs: - print_warning_once( + logger.warning_once( "'add_generation_prompt' is not supported for mistral tokenizer, " "so it will be ignored.") if "continue_final_message" in kwargs: - print_warning_once( + logger.warning_once( "'continue_final_message' is not supported for mistral tokenizer, " "so it will be ignored.") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3e92d5821e645..a738ffe18e3ae 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -10,7 +10,6 @@ from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.utils import print_info_once, print_warning_once from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, PromptType, SingletonInputs, SingletonPrompt, token_inputs) @@ -68,21 +67,24 @@ def get_decoder_start_token_id(self) -> Optional[int]: ''' if not self.model_config.is_encoder_decoder: - print_warning_once("Using None for decoder start token id because " - "this is not an encoder/decoder model.") + logger.warning_once( + "Using None for decoder start token id because " + "this is not an encoder/decoder model.") return None if (self.model_config is None or self.model_config.hf_config is None): - print_warning_once("Using None for decoder start token id because " - "model config is not available.") + logger.warning_once( + "Using None for decoder start token id because " + "model config is not available.") return None dec_start_token_id = getattr(self.model_config.hf_config, 'decoder_start_token_id', None) if dec_start_token_id is None: - print_warning_once("Falling back on for decoder start token " - "id because decoder start token id is not " - "available.") + logger.warning_once( + "Falling back on for decoder start token " + "id because decoder start token id is not " + "available.") dec_start_token_id = self.get_bos_token_id() return dec_start_token_id @@ -231,7 +233,7 @@ def _can_process_multimodal(self) -> bool: # updated to use the new multi-modal processor can_process_multimodal = self.mm_registry.has_processor(model_config) if not can_process_multimodal: - print_info_once( + logger.info_once( "Your model uses the legacy input pipeline instead of the new " "multi-modal processor. Please note that the legacy pipeline " "will be removed in a future release. 
For more details, see: " diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index b22b3f1594f24..aad0dfab94a01 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -12,7 +12,7 @@ from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, - print_warning_once, resolve_mm_processor_kwargs) + resolve_mm_processor_kwargs) from .data import ProcessorInputs, SingletonInputs from .parse import is_encoder_decoder_inputs @@ -352,7 +352,7 @@ def dummy_data_for_profiling( num_tokens = dummy_data.seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: - print_warning_once( + logger.warning_once( f"Expected at least {seq_len} dummy encoder tokens for " f"profiling, but found {len(num_tokens)} tokens instead.") else: diff --git a/vllm/logger.py b/vllm/logger.py index 538db0dcf19aa..cac174f7ba02a 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -4,11 +4,12 @@ import logging import os import sys -from functools import partial +from functools import lru_cache, partial from logging import Logger from logging.config import dictConfig from os import path -from typing import Dict, Optional +from types import MethodType +from typing import Any, Optional, cast import vllm.envs as envs @@ -49,8 +50,44 @@ } +@lru_cache +def _print_info_once(logger: Logger, msg: str) -> None: + # Set the stacklevel to 2 to print the original caller's line info + logger.info(msg, stacklevel=2) + + +@lru_cache +def _print_warning_once(logger: Logger, msg: str) -> None: + # Set the stacklevel to 2 to print the original caller's line info + logger.warning(msg, stacklevel=2) + + +class _VllmLogger(Logger): + """ + Note: + This class is just to provide type information. + We actually patch the methods directly on the :class:`logging.Logger` + instance to avoid conflicting with other libraries such as + `intel_extension_for_pytorch.utils._logger`. + """ + + def info_once(self, msg: str) -> None: + """ + As :meth:`info`, but subsequent calls with the same message + are silently dropped. + """ + _print_info_once(self, msg) + + def warning_once(self, msg: str) -> None: + """ + As :meth:`warning`, but subsequent calls with the same message + are silently dropped. + """ + _print_warning_once(self, msg) + + def _configure_vllm_root_logger() -> None: - logging_config: Dict = {} + logging_config = dict[str, Any]() if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH: raise RuntimeError( @@ -84,12 +121,22 @@ def _configure_vllm_root_logger() -> None: dictConfig(logging_config) -def init_logger(name: str) -> Logger: +def init_logger(name: str) -> _VllmLogger: """The main purpose of this function is to ensure that loggers are retrieved in such a way that we can be sure the root vllm logger has already been configured.""" - return logging.getLogger(name) + logger = logging.getLogger(name) + + methods_to_patch = { + "info_once": _print_info_once, + "warning_once": _print_warning_once, + } + + for method_name, method in methods_to_patch.items(): + setattr(logger, method_name, MethodType(method, logger)) + + return cast(_VllmLogger, logger) # The root logger is initialized when the module is imported. 
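The `vllm/logger.py` hunk above deduplicates repeated log messages by caching on the `(logger, msg)` pair and binding the helpers directly onto each `Logger` instance rather than subclassing. Below is a minimal, self-contained sketch of that pattern for illustration only; the names are not vLLM's, and the real module also wires up `info_once` and the vLLM logging configuration:

```python
import logging
from functools import lru_cache
from types import MethodType

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")


@lru_cache
def _warn_once(log: logging.Logger, msg: str) -> None:
    # lru_cache keys on (log, msg); a repeated message is a cache hit,
    # so the underlying warning() call is skipped entirely.
    log.warning(msg, stacklevel=2)


# Patch the method onto the instance (not a subclass), mirroring the diff's
# approach of avoiding conflicts with libraries that swap out the Logger class.
logger.warning_once = MethodType(_warn_once, logger)

logger.warning_once("only printed once")  # emitted
logger.warning_once("only printed once")  # silently dropped
```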
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index ddd42ae93d290..dacfb9ebd1480 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -4,7 +4,9 @@ from dataclasses import MISSING, dataclass, field, fields from typing import Literal, Optional, Union -from vllm.utils import print_info_once +from vllm.logger import init_logger + +logger = init_logger(__name__) @dataclass @@ -42,7 +44,7 @@ def _validate_features(self): def __post_init__(self): self._validate_features() if self.use_rslora: - print_info_once("Loading LoRA weights trained with rsLoRA.") + logger.info_once("Loading LoRA weights trained with rsLoRA.") self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) else: self.vllm_lora_scaling_factor = self.lora_alpha / self.r diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index cd64878d95ae3..9791d492d8e48 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -1,19 +1,21 @@ +from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import print_info_once from .punica_base import PunicaWrapperBase +logger = init_logger(__name__) + def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: if current_platform.is_cuda_alike(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU - print_info_once("Using PunicaWrapperGPU.") + logger.info_once("Using PunicaWrapperGPU.") return PunicaWrapperGPU(*args, **kwargs) elif current_platform.is_hpu(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU - print_info_once("Using PunicaWrapperHPU.") + logger.info_once("Using PunicaWrapperHPU.") return PunicaWrapperHPU(*args, **kwargs) else: raise NotImplementedError diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index fddc8bad09ef5..401606e8c76f0 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -5,7 +5,6 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -91,7 +90,7 @@ def enabled(cls) -> bool: compilation_config = get_current_vllm_config().compilation_config custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): - print_warning_once( + logger.warning_once( f"Custom op {cls.__name__} was not registered, " f"which means it won't appear in the op registry. 
" f"It will be enabled/disabled based on the global settings.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 5fd6b017f444b..4fb8fd84e92d4 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -8,6 +8,7 @@ import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( @@ -16,7 +17,8 @@ all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import print_warning_once + +logger = init_logger(__name__) class GPTQMarlinState(Enum): @@ -142,10 +144,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " - "for each layer. ") + "for each layer.") layer.w13_input_scale = torch.nn.Parameter( layer.w13_input_scale.max(), requires_grad=False) layer.w2_input_scale = torch.nn.Parameter( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 2fe22903a385b..a1be45a49e94a 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -28,7 +28,6 @@ PerTensorScaleParameter) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import print_warning_once ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -539,10 +538,10 @@ def process_weights_after_loading(self, layer: Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " - "for each layer. ") + "for each layer.") layer.w13_input_scale = torch.nn.Parameter( layer.w13_input_scale.max(), requires_grad=False) layer.w2_input_scale = torch.nn.Parameter( diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index d79536d196b92..a74f5415c8a51 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -1,8 +1,10 @@ import torch +from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) -from vllm.utils import print_warning_once + +logger = init_logger(__name__) class BaseKVCacheMethod(QuantizeMethodBase): @@ -67,7 +69,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer._v_scale = v_scale if (layer._k_scale == 1.0 and layer._v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype): - print_warning_once( + logger.warning_once( "Using KV cache scaling factor 1.0 for fp8_e4m3. This " "may cause accuracy issues. 
Please make sure k/v_scale " "scaling factors are available in the fp8 checkpoint.") diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 8b3dfaae971c3..245fe9238e421 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -3,11 +3,13 @@ import torch import vllm._custom_ops as ops +from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import print_warning_once from .marlin_utils import marlin_make_workspace, marlin_permute_scales +logger = init_logger(__name__) + def is_fp8_marlin_supported(): return current_platform.has_device_capability(80) @@ -47,7 +49,7 @@ def apply_fp8_marlin_linear( def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, strategy: str = "tensor") -> None: - print_warning_once( + logger.warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. Weight-only FP8 compression will " "be used leveraging the Marlin kernel. This may degrade " diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index a2c991cfdb74e..11d5fd7135d9e 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -25,7 +25,7 @@ get_quantization_config) from vllm.model_executor.layers.quantization.schema import QuantParamSchema from vllm.platforms import current_platform -from vllm.utils import PlaceholderModule, print_warning_once +from vllm.utils import PlaceholderModule try: from runai_model_streamer import SafetensorsStreamer @@ -673,7 +673,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: None: If the remapped name is not found in params_dict. """ if name.endswith(".kv_scale"): - print_warning_once( + logger.warning_once( "DEPRECATED. Found kv_scale in the checkpoint. " "This format is deprecated in favor of separate k_scale and " "v_scale tensors and will be removed in a future release. " @@ -682,7 +682,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: # NOTE: we remap the deprecated kv_scale to k_scale remapped_name = name.replace(".kv_scale", ".attn.k_scale") if remapped_name not in params_dict: - print_warning_once( + logger.warning_once( f"Found kv_scale in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). kv_scale is " @@ -695,7 +695,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: if name.endswith(scale_name): remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: - print_warning_once( + logger.warning_once( f"Found {scale_name} in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). 
{scale_name} is " diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index acff926891bbe..452fe727875fe 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -11,6 +11,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -35,13 +36,14 @@ BaseProcessingInfo, PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) +logger = init_logger(__name__) + class ChameleonImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -1111,7 +1113,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint (e.g. " f"{name}), but not found the expected name in " f"the model (e.g. {remapped_kv_scale_name}). " diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 5d9091cfb9311..fbe5d1aee04b3 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -20,6 +20,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -34,13 +35,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class OlmoeMoE(nn.Module): """A tensor-parallel MoE implementation for Olmoe that shards each expert @@ -446,7 +448,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. 
{name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index ba70243c6533d..95de6c21871bf 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -34,6 +34,7 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -50,13 +51,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsPP from .utils import (extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class Qwen2MoeMLP(nn.Module): @@ -524,7 +526,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index e6a9e153d9107..a1395982af44c 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -7,8 +7,10 @@ import vllm.envs as envs from vllm.attention.selector import (backend_name_to_enum, get_global_forced_attn_backend) +from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform -from vllm.utils import print_warning_once + +logger = init_logger(__name__) _C = TypeVar("_C", bound=PretrainedConfig) @@ -87,7 +89,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if is_flash_attn_2_available(): selected_backend = _Backend.FLASH_ATTN else: - print_warning_once( + logger.warning_once( "Current `vllm-flash-attn` has a bug inside vision module, " "so we use xformers backend instead. 
You can run " "`pip install flash-attn` to use flash-attention backend.") diff --git a/vllm/utils.py b/vllm/utils.py index c09cae70e9af8..a92b77efd9fd8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -696,18 +696,6 @@ def create_kv_caches_with_random( return key_caches, value_caches -@lru_cache -def print_info_once(msg: str) -> None: - # Set the stacklevel to 2 to print the caller's line info - logger.info(msg, stacklevel=2) - - -@lru_cache -def print_warning_once(msg: str) -> None: - # Set the stacklevel to 2 to print the caller's line info - logger.warning(msg, stacklevel=2) - - @lru_cache(maxsize=None) def is_pin_memory_available() -> bool: from vllm.platforms import current_platform From a732900efc4eb0d4393e3885d5df8ef3516d4834 Mon Sep 17 00:00:00 2001 From: Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:39:39 +0700 Subject: [PATCH 30/55] [Doc] Intended links Python multiprocessing library (#11878) --- docs/source/design/multiprocessing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index da87638e5b743..c2cdb75ea08a7 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -21,7 +21,7 @@ This document describes how vLLM deals with these challenges. ## Multiprocessing Methods -[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html.md#contexts-and-start-methods) include: +[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python 3.14. From 310aca88c984983189a57f1b72e3b1dde89fb92f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 9 Jan 2025 15:18:21 +0800 Subject: [PATCH 31/55] [perf]fix current stream (#11870) Signed-off-by: youkaichao --- .../device_communicators/pynccl.py | 15 +++++---- vllm/distributed/parallel_state.py | 5 +-- vllm/utils.py | 33 +++++++++++++++++++ vllm/worker/multi_step_model_runner.py | 8 ++--- 4 files changed, 46 insertions(+), 15 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index fda4d007ceb5b..efc59987195f5 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -10,6 +10,7 @@ ncclRedOpTypeEnum, ncclUniqueId) from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger +from vllm.utils import current_stream logger = init_logger(__name__) @@ -96,7 +97,7 @@ def __init__( self.comm: ncclComm_t = self.nccl.ncclCommInitRank( self.world_size, self.unique_id, self.rank) - stream = torch.cuda.current_stream() + stream = current_stream() # A small all_reduce for warmup. 
data = torch.zeros(1, device=device) self.all_reduce(data) @@ -119,7 +120,7 @@ def all_reduce(self, out_tensor = torch.empty_like(in_tensor) if stream is None: - stream = torch.cuda.current_stream() + stream = current_stream() self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()), buffer_type(out_tensor.data_ptr()), in_tensor.numel(), @@ -141,7 +142,7 @@ def all_gather(self, f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {input_tensor.device}") if stream is None: - stream = torch.cuda.current_stream() + stream = current_stream() self.nccl.ncclAllGather( buffer_type(input_tensor.data_ptr()), buffer_type(output_tensor.data_ptr()), input_tensor.numel(), @@ -162,7 +163,7 @@ def reduce_scatter(self, f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {input_tensor.device}") if stream is None: - stream = torch.cuda.current_stream() + stream = current_stream() self.nccl.ncclReduceScatter( buffer_type(input_tensor.data_ptr()), buffer_type(output_tensor.data_ptr()), output_tensor.numel(), @@ -177,7 +178,7 @@ def send(self, tensor: torch.Tensor, dst: int, stream=None): f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = torch.cuda.current_stream() + stream = current_stream() self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, self.comm, cudaStream_t(stream.cuda_stream)) @@ -189,7 +190,7 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None): f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = torch.cuda.current_stream() + stream = current_stream() self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) @@ -201,7 +202,7 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None): f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = torch.cuda.current_stream() + stream = current_stream() if src == self.rank: sendbuff = buffer_type(tensor.data_ptr()) # NCCL requires the sender also to have a receive buffer diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a837c1dc5953b..be7f16ef52a47 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -357,10 +357,7 @@ def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor: return out pynccl_comm = self.pynccl_comm assert pynccl_comm is not None - # TODO: pynccl should not use `stream=` - # it can just always use the current stream. - out = pynccl_comm.all_reduce(input_, - stream=torch.cuda.current_stream()) + out = pynccl_comm.all_reduce(input_) if out is None: # fall back to the default all-reduce using PyTorch. # this usually happens during testing. 
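PATCH 31 above replaces repeated `torch.cuda.current_stream()` calls with a stream reference cached in `vllm.utils` (the hunk that follows). As a rough, hypothetical illustration of the overhead being avoided: the snippet below needs a CUDA device, the iteration count is arbitrary, and the printed timings are whatever your machine produces, not measurements from this PR.

```python
import time

import torch

assert torch.cuda.is_available(), "illustration requires a CUDA device"

cached = torch.cuda.current_stream()  # look the stream up once and reuse it

t0 = time.perf_counter()
for _ in range(10_000):
    torch.cuda.current_stream()  # constructs a new Stream wrapper on every call
t1 = time.perf_counter()
for _ in range(10_000):
    _ = cached  # plain variable access, no new object per call
t2 = time.perf_counter()

print(f"current_stream() x10k: {t1 - t0:.4f}s; cached reference x10k: {t2 - t1:.4f}s")
```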
diff --git a/vllm/utils.py b/vllm/utils.py index a92b77efd9fd8..0b0905e675245 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -944,6 +944,39 @@ def find_nccl_library() -> str: return so_file +prev_set_stream = torch.cuda.set_stream + +_current_stream = None + + +def _patched_set_stream(stream: torch.cuda.Stream) -> None: + global _current_stream + _current_stream = stream + prev_set_stream(stream) + + +torch.cuda.set_stream = _patched_set_stream + + +def current_stream() -> torch.cuda.Stream: + """ + replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`. + it turns out that `torch.cuda.current_stream()` is quite expensive, + as it will construct a new stream object at each call. + here we patch `torch.cuda.set_stream` to keep track of the current stream + directly, so that we can avoid calling `torch.cuda.current_stream()`. + + the underlying hypothesis is that we do not call `torch._C._cuda_setStream` + from C/C++ code. + """ + global _current_stream + if _current_stream is None: + # when this function is called before any stream is set, + # we return the default stream. + _current_stream = torch.cuda.current_stream() + return _current_stream + + def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None: """Set up function tracing for the current thread, if enabled via the VLLM_TRACE_FUNCTION environment variable diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index a2c2cebf8d1f6..acce923498d7e 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -14,7 +14,7 @@ get_pythonized_sample_results) from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.utils import PyObjectCache, async_tensor_h2d +from vllm.utils import PyObjectCache, async_tensor_h2d, current_stream from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( @@ -498,7 +498,7 @@ def execute_model( # appended sampler output from last iteration # - also maybe pythonize if CPU is ahead of GPU - current_stream = torch.cuda.current_stream() + stream = current_stream() if not model_input.is_first_multi_step: # Explicitly block on the previous step's forward to make sure we # don't clobber any GPU tensors still in use. @@ -541,7 +541,7 @@ def execute_model( num_steps=1) # record the event for the current step so that the next step can sync - model_input.record_step_event(current_stream) + model_input.record_step_event(stream) if get_pp_group().is_last_rank and self.is_driver_worker: assert isinstance(output, list) @@ -552,7 +552,7 @@ def execute_model( # event for the pythonization so that we only pythonize if the # tensors are ready. 
May be able to be combined with the step event output_ready_event = torch.cuda.Event() - output_ready_event.record(current_stream) + output_ready_event.record(stream) if self.parallel_config.pipeline_parallel_size > 1: output[0].sampled_token_ids_cpu = output[ 0].sampled_token_ids.cpu() From 0bd1ff43469f867f92786a3596c3e4a64df43400 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 9 Jan 2025 17:02:53 +0800 Subject: [PATCH 32/55] [Bugfix] Override dunder methods of placeholder modules (#11882) Signed-off-by: DarkLight1337 --- tests/test_utils.py | 47 ++++++++++- vllm/utils.py | 189 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 220 insertions(+), 16 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 0285b00d73be1..14d2fbd63b90d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,9 +7,9 @@ import torch from vllm_test_utils import monitor -from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs, - get_open_port, memory_profiling, merge_async_iterators, - supports_kw) +from vllm.utils import (FlexibleArgumentParser, PlaceholderModule, + StoreBoolean, deprecate_kwargs, get_open_port, + memory_profiling, merge_async_iterators, supports_kw) from .utils import error_on_warning, fork_new_process_for_each_test @@ -323,3 +323,44 @@ def measure_current_non_torch(): del weights lib.cudaFree(handle1) lib.cudaFree(handle2) + + +def test_placeholder_module_error_handling(): + placeholder = PlaceholderModule("placeholder_1234") + + def build_ctx(): + return pytest.raises(ModuleNotFoundError, + match="No module named") + + with build_ctx(): + int(placeholder) + + with build_ctx(): + placeholder() + + with build_ctx(): + _ = placeholder.some_attr + + with build_ctx(): + # Test conflict with internal __name attribute + _ = placeholder.name + + # OK to print the placeholder or use it in a f-string + _ = repr(placeholder) + _ = str(placeholder) + + # No error yet; only error when it is used downstream + placeholder_attr = placeholder.placeholder_attr("attr") + + with build_ctx(): + int(placeholder_attr) + + with build_ctx(): + placeholder_attr() + + with build_ctx(): + _ = placeholder_attr.some_attr + + with build_ctx(): + # Test conflict with internal __module attribute + _ = placeholder_attr.module diff --git a/vllm/utils.py b/vllm/utils.py index 0b0905e675245..487088591ebc2 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -46,7 +46,7 @@ import zmq.asyncio from packaging.version import Version from torch.library import Library -from typing_extensions import ParamSpec, TypeIs, assert_never +from typing_extensions import Never, ParamSpec, TypeIs, assert_never import vllm.envs as envs from vllm.logger import enable_trace_function_call, init_logger @@ -1627,24 +1627,183 @@ def get_vllm_optional_dependencies(): } -@dataclass(frozen=True) -class PlaceholderModule: +class _PlaceholderBase: + """ + Disallows downstream usage of placeholder modules. + + We need to explicitly override each dunder method because + :meth:`__getattr__` is not called when they are accessed. + + See also: + [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) + """ + + def __getattr__(self, key: str) -> Never: + """ + The main class should implement this to throw an error + for attribute accesses representing downstream usage. 
+ """ + raise NotImplementedError + + # [Basic customization] + + def __lt__(self, other: object): + return self.__getattr__("__lt__") + + def __le__(self, other: object): + return self.__getattr__("__le__") + + def __eq__(self, other: object): + return self.__getattr__("__eq__") + + def __ne__(self, other: object): + return self.__getattr__("__ne__") + + def __gt__(self, other: object): + return self.__getattr__("__gt__") + + def __ge__(self, other: object): + return self.__getattr__("__ge__") + + def __hash__(self): + return self.__getattr__("__hash__") + + def __bool__(self): + return self.__getattr__("__bool__") + + # [Callable objects] + + def __call__(self, *args: object, **kwargs: object): + return self.__getattr__("__call__") + + # [Container types] + + def __len__(self): + return self.__getattr__("__len__") + + def __getitem__(self, key: object): + return self.__getattr__("__getitem__") + + def __setitem__(self, key: object, value: object): + return self.__getattr__("__setitem__") + + def __delitem__(self, key: object): + return self.__getattr__("__delitem__") + + # __missing__ is optional according to __getitem__ specification, + # so it is skipped + + # __iter__ and __reversed__ have a default implementation + # based on __len__ and __getitem__, so they are skipped. + + # [Numeric Types] + + def __add__(self, other: object): + return self.__getattr__("__add__") + + def __sub__(self, other: object): + return self.__getattr__("__sub__") + + def __mul__(self, other: object): + return self.__getattr__("__mul__") + + def __matmul__(self, other: object): + return self.__getattr__("__matmul__") + + def __truediv__(self, other: object): + return self.__getattr__("__truediv__") + + def __floordiv__(self, other: object): + return self.__getattr__("__floordiv__") + + def __mod__(self, other: object): + return self.__getattr__("__mod__") + + def __divmod__(self, other: object): + return self.__getattr__("__divmod__") + + def __pow__(self, other: object, modulo: object = ...): + return self.__getattr__("__pow__") + + def __lshift__(self, other: object): + return self.__getattr__("__lshift__") + + def __rshift__(self, other: object): + return self.__getattr__("__rshift__") + + def __and__(self, other: object): + return self.__getattr__("__and__") + + def __xor__(self, other: object): + return self.__getattr__("__xor__") + + def __or__(self, other: object): + return self.__getattr__("__or__") + + # r* and i* methods have lower priority than + # the methods for left operand so they are skipped + + def __neg__(self): + return self.__getattr__("__neg__") + + def __pos__(self): + return self.__getattr__("__pos__") + + def __abs__(self): + return self.__getattr__("__abs__") + + def __invert__(self): + return self.__getattr__("__invert__") + + # __complex__, __int__ and __float__ have a default implementation + # based on __index__, so they are skipped. + + def __index__(self): + return self.__getattr__("__index__") + + def __round__(self, ndigits: object = ...): + return self.__getattr__("__round__") + + def __trunc__(self): + return self.__getattr__("__trunc__") + + def __floor__(self): + return self.__getattr__("__floor__") + + def __ceil__(self): + return self.__getattr__("__ceil__") + + # [Context managers] + + def __enter__(self): + return self.__getattr__("__enter__") + + def __exit__(self, *args: object, **kwargs: object): + return self.__getattr__("__exit__") + + +class PlaceholderModule(_PlaceholderBase): """ A placeholder object to use when a module does not exist. 
This enables more informative errors when trying to access attributes of a module that does not exists. """ - name: str + + def __init__(self, name: str) -> None: + super().__init__() + + # Apply name mangling to avoid conflicting with module attributes + self.__name = name def placeholder_attr(self, attr_path: str): return _PlaceholderModuleAttr(self, attr_path) def __getattr__(self, key: str): - name = self.name + name = self.__name try: - importlib.import_module(self.name) + importlib.import_module(name) except ImportError as exc: for extra, names in get_vllm_optional_dependencies().items(): if name in names: @@ -1657,17 +1816,21 @@ def __getattr__(self, key: str): "when the original module can be imported") -@dataclass(frozen=True) -class _PlaceholderModuleAttr: - module: PlaceholderModule - attr_path: str +class _PlaceholderModuleAttr(_PlaceholderBase): + + def __init__(self, module: PlaceholderModule, attr_path: str) -> None: + super().__init__() + + # Apply name mangling to avoid conflicting with module attributes + self.__module = module + self.__attr_path = attr_path def placeholder_attr(self, attr_path: str): - return _PlaceholderModuleAttr(self.module, - f"{self.attr_path}.{attr_path}") + return _PlaceholderModuleAttr(self.__module, + f"{self.__attr_path}.{attr_path}") def __getattr__(self, key: str): - getattr(self.module, f"{self.attr_path}.{key}") + getattr(self.__module, f"{self.__attr_path}.{key}") raise AssertionError("PlaceholderModule should not be used " "when the original module can be imported") From 1d967acb45d5d18434409b822f105f087e379eee Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Thu, 9 Jan 2025 01:36:39 -0800 Subject: [PATCH 33/55] [Bugfix] fix beam search input errors and latency benchmark script (#11875) Signed-off-by: Ye Qi Co-authored-by: yeq --- benchmarks/benchmark_latency.py | 23 +++++++++++++++++------ vllm/entrypoints/llm.py | 10 ++++++---- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e669ce4db299d..77c4f6aa927e4 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -13,6 +13,7 @@ from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType +from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser @@ -40,6 +41,20 @@ def main(args: argparse.Namespace): "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + )) + def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: with torch.profiler.profile( @@ -49,15 +64,11 @@ def run_to_completion(profile_dir: Optional[str] = None): ], on_trace_ready=torch.profiler.tensorboard_trace_handler( str(profile_dir))) as p: - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) + llm_generate() print(p.key_averages().table(sort_by="self_cuda_time_total")) else: start_time = time.perf_counter() - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) + llm_generate() end_time = time.perf_counter() latency = end_time - start_time return latency diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e48fd1a4fa5e9..acb4db85632a8 
100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -21,7 +21,7 @@ parse_chat_messages, resolve_chat_template_content_format) from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt -from vllm.inputs.parse import parse_and_batch_prompt +from vllm.inputs.parse import is_token_prompt, parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding.guided_fields import ( @@ -457,7 +457,7 @@ def generate( def beam_search( self, - prompts: List[Union[str, List[int]]], + prompts: List[Union[TokensPrompt, TextPrompt]], params: BeamSearchParams, ) -> List[BeamSearchOutput]: """ @@ -493,8 +493,10 @@ def sort_beams_key(x: BeamSearchSequence) -> float: instances: List[BeamSearchInstance] = [] for prompt in prompts: - prompt_tokens = prompt if isinstance( - prompt, list) else tokenizer.encode(prompt) + if is_token_prompt(prompt): + prompt_tokens = prompt["prompt_token_ids"] + else: + prompt_tokens = tokenizer.encode(prompt["prompt"]) instances.append(BeamSearchInstance(prompt_tokens)) for _ in range(max_tokens): From 65097ca0af5c1d7caa3d9d8224fa8b4790a5f7bc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 9 Jan 2025 17:43:40 +0800 Subject: [PATCH 34/55] [Doc] Add model development API Reference (#11884) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 2 +- docs/source/api/{params.md => inference_params.md} | 5 ++--- docs/source/api/model/adapters.md | 9 +++++++++ docs/source/api/model/index.md | 12 ++++++++++++ docs/source/api/model/interfaces.md | 9 +++++++++ docs/source/api/model/interfaces_base.md | 9 +++++++++ docs/source/index.md | 3 ++- vllm/model_executor/models/interfaces.py | 11 +++++++---- vllm/model_executor/models/interfaces_base.py | 3 +++ 9 files changed, 54 insertions(+), 9 deletions(-) rename docs/source/api/{params.md => inference_params.md} (79%) create mode 100644 docs/source/api/model/adapters.md create mode 100644 docs/source/api/model/index.md create mode 100644 docs/source/api/model/interfaces.md create mode 100644 docs/source/api/model/interfaces_base.md diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f883595f6d9ad..e288f8f30159a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -38,7 +38,7 @@ steps: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/api/params.html + - grep \"sig sig-object py\" build/html/api/inference_params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min fast_check: true diff --git a/docs/source/api/params.md b/docs/source/api/inference_params.md similarity index 79% rename from docs/source/api/params.md rename to docs/source/api/inference_params.md index a3b4d9cbb44ec..181c30cab9c4a 100644 --- a/docs/source/api/params.md +++ b/docs/source/api/inference_params.md @@ -1,6 +1,6 @@ -# Optional Parameters +# Inference Parameters -Optional parameters for vLLM APIs. +Inference parameters for vLLM APIs. (sampling-params)= @@ -19,4 +19,3 @@ Optional parameters for vLLM APIs. .. autoclass:: vllm.PoolingParams :members: ``` - diff --git a/docs/source/api/model/adapters.md b/docs/source/api/model/adapters.md new file mode 100644 index 0000000000000..e103a51d0070d --- /dev/null +++ b/docs/source/api/model/adapters.md @@ -0,0 +1,9 @@ +# Model Adapters + +## Module Contents + +```{eval-rst} +.. 
automodule:: vllm.model_executor.models.adapters + :members: + :member-order: bysource +``` diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md new file mode 100644 index 0000000000000..b8437e3c3517a --- /dev/null +++ b/docs/source/api/model/index.md @@ -0,0 +1,12 @@ +# Model Development + +## Submodules + +```{toctree} +:maxdepth: 1 + +interfaces_base +interfaces +adapters +``` + diff --git a/docs/source/api/model/interfaces.md b/docs/source/api/model/interfaces.md new file mode 100644 index 0000000000000..55bee57f64faa --- /dev/null +++ b/docs/source/api/model/interfaces.md @@ -0,0 +1,9 @@ +# Optional Interfaces + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.model_executor.models.interfaces + :members: + :member-order: bysource +``` diff --git a/docs/source/api/model/interfaces_base.md b/docs/source/api/model/interfaces_base.md new file mode 100644 index 0000000000000..75d58d34228e9 --- /dev/null +++ b/docs/source/api/model/interfaces_base.md @@ -0,0 +1,9 @@ +# Base Model Interfaces + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.model_executor.models.interfaces_base + :members: + :member-order: bysource +``` diff --git a/docs/source/index.md b/docs/source/index.md index 6747a7fcce4fe..23e4304fe29d9 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -139,8 +139,9 @@ community/sponsors api/offline_inference/index api/engine/index +api/inference_params api/multimodal/index -api/params +api/model/index ``` % Design Documents: Details about vLLM internals diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 6f26603046483..b51cba86ec1a4 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -38,13 +38,15 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]: to be merged with text embeddings. The output embeddings must be one of the following formats: + - A list or tuple of 2D tensors, where each tensor corresponds to - each input multimodal data item (e.g, image). + each input multimodal data item (e.g, image). - A single 3D tensor, with the batch dimension grouping the 2D tensors. - NOTE: The returned multimodal embeddings must be in the same order as - the appearances of their corresponding multimodal data item in the - input prompt. + Note: + The returned multimodal embeddings must be in the same order as + the appearances of their corresponding multimodal data item in the + input prompt. """ ... @@ -59,6 +61,7 @@ def get_input_embeddings( ) -> torch.Tensor: ... 
+ @overload def get_input_embeddings( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index de733b6d49a53..4c353ae6ffc13 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -35,6 +35,7 @@ @runtime_checkable class VllmModel(Protocol[C_co, T_co]): + """The interface required for all models in vLLM.""" def __init__( self, @@ -97,6 +98,7 @@ def is_vllm_model( @runtime_checkable class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]): + """The interface required for all generative models in vLLM.""" def compute_logits( self, @@ -142,6 +144,7 @@ def is_text_generation_model( @runtime_checkable class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]): + """The interface required for all pooling models in vLLM.""" def pooler( self, From 405eb8e3967eb9bd263b3919796cb3b45a2931d3 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Thu, 9 Jan 2025 21:46:50 +0800 Subject: [PATCH 35/55] [platform] Allow platform specify attention backend (#11609) Signed-off-by: wangxiyuan Signed-off-by: Mengqing Cao Co-authored-by: Mengqing Cao --- tests/kernels/test_attention_selector.py | 74 ++++++------ vllm/attention/selector.py | 139 ++--------------------- vllm/platforms/cpu.py | 7 +- vllm/platforms/cuda.py | 77 ++++++++++++- vllm/platforms/hpu.py | 7 +- vllm/platforms/interface.py | 8 +- vllm/platforms/openvino.py | 7 +- vllm/platforms/rocm.py | 6 +- vllm/platforms/tpu.py | 7 +- vllm/platforms/xpu.py | 7 +- 10 files changed, 164 insertions(+), 175 deletions(-) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 916cc2efa3895..a08c874407e3f 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -1,10 +1,10 @@ -from unittest.mock import patch +from unittest.mock import Mock, patch import pytest import torch from tests.kernels.utils import override_backend_env_variable -from vllm.attention.selector import which_attn_to_use +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.openvino import OpenVinoPlatform @@ -12,6 +12,13 @@ from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. 
+ """ + _cached_get_attn_backend.cache_clear() + + @pytest.mark.parametrize( "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) @@ -24,67 +31,70 @@ def test_env(name: str, device: str, monkeypatch): if device == "cpu": with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "TORCH_SDPA" + backend = get_attn_backend(16, torch.float16, torch.float16, 16, + False) + assert backend.get_name() == "TORCH_SDPA" elif device == "hip": with patch("vllm.attention.selector.current_platform", RocmPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "ROCM_FLASH" + backend = get_attn_backend(16, torch.float16, torch.float16, 16, + False) + assert backend.get_name() == "ROCM_FLASH" elif device == "openvino": with patch("vllm.attention.selector.current_platform", - OpenVinoPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "OPENVINO" + OpenVinoPlatform()), patch.dict('sys.modules', + {'openvino': Mock()}): + backend = get_attn_backend(16, torch.float16, torch.float16, 16, + False) + assert backend.get_name() == "OPENVINO" else: - with patch("vllm.attention.selector.current_platform", CudaPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == name + if name in ["XFORMERS", "FLASHINFER"]: + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + assert backend.get_name() == name def test_flash_attn(monkeypatch): """Test FlashAttn validation.""" # TODO: When testing for v1, pipe in `use_v1` as an argument to - # which_attn_to_use + # get_attn_backend override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) # Unsupported CUDA arch with patch("torch.cuda.get_device_capability", return_value=(7, 5)): - backend = which_attn_to_use(16, torch.float16, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported data type - backend = which_attn_to_use(16, torch.float8_e4m3fn, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported kv cache data type - backend = which_attn_to_use(16, torch.float16, "fp8", 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported block size - backend = which_attn_to_use(16, torch.float16, None, 8, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # flash-attn is not installed with patch.dict('sys.modules', {'vllm_flash_attn': None}): - backend = which_attn_to_use(16, torch.float16, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported head size - backend = which_attn_to_use(17, torch.float16, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + 
backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Attention-free models should bypass env and use PlaceholderAttention - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, True) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL def test_invalid_env(monkeypatch): """Throw an exception if the backend name is invalid.""" override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with pytest.raises(ValueError): - which_attn_to_use(16, torch.float16, None, 16, False) + get_attn_backend(16, torch.float16, None, 16, False) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index d263839705690..0ff007c87b1c9 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -9,7 +9,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform -from vllm.utils import STR_BACKEND_ENV_VAR +from vllm.utils import STR_BACKEND_ENV_VAR, resolve_obj_by_qualname logger = init_logger(__name__) @@ -114,83 +114,19 @@ def _cached_get_attn_backend( BlocksparseFlashAttentionBackend) return BlocksparseFlashAttentionBackend - backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size, - is_attention_free, use_v1) - if backend == _Backend.FLASH_ATTN: - logger.info("Using Flash Attention backend.") - from vllm.attention.backends.flash_attn import ( # noqa: F401 - FlashAttentionBackend) - return FlashAttentionBackend - if backend == _Backend.FLASH_ATTN_VLLM_V1: - from vllm.v1.attention.backends.flash_attn import ( # noqa: F401 - FlashAttentionBackend as FlashAttentionBackendV1) - return FlashAttentionBackendV1 - if backend == _Backend.XFORMERS: - logger.info("Using XFormers backend.") - from vllm.attention.backends.xformers import ( # noqa: F401 - XFormersBackend) - return XFormersBackend - elif backend == _Backend.ROCM_FLASH: - logger.info("Using ROCmFlashAttention backend.") - from vllm.attention.backends.rocm_flash_attn import ( # noqa: F401 - ROCmFlashAttentionBackend) - return ROCmFlashAttentionBackend - elif backend == _Backend.TORCH_SDPA: - assert current_platform.is_cpu(), RuntimeError( - "Torch SDPA backend is only used for the CPU device.") - logger.info("Using Torch SDPA backend.") - from vllm.attention.backends.torch_sdpa import TorchSDPABackend - return TorchSDPABackend - elif backend == _Backend.OPENVINO: - logger.info("Using OpenVINO Attention backend.") - from vllm.attention.backends.openvino import OpenVINOAttentionBackend - return OpenVINOAttentionBackend - elif backend == _Backend.IPEX: - assert current_platform.is_xpu(), RuntimeError( - "IPEX attention backend is only used for the XPU device.") - logger.info("Using IPEX attention backend.") - from vllm.attention.backends.ipex_attn import IpexAttnBackend - return IpexAttnBackend - elif backend == _Backend.FLASHINFER: - logger.info("Using Flashinfer backend.") - from vllm.attention.backends.flashinfer import FlashInferBackend - return FlashInferBackend - elif backend == _Backend.HPU_ATTN: - logger.info("Using HPUAttention backend.") - from vllm.attention.backends.hpu_attn import HPUAttentionBackend - return HPUAttentionBackend - elif backend == _Backend.PALLAS: - logger.info("Using Pallas backend.") - from vllm.attention.backends.pallas import PallasAttentionBackend - return PallasAttentionBackend - elif backend == 
_Backend.NO_ATTENTION: - from vllm.attention.backends.placeholder_attn import ( - PlaceholderAttentionBackend) - return PlaceholderAttentionBackend - else: - raise ValueError("Invalid attention backend.") - - -def which_attn_to_use(head_size: int, - dtype: torch.dtype, - kv_cache_dtype: Optional[str], - block_size: int, - is_attention_free: bool, - use_v1: bool = False) -> _Backend: - """Returns which flash attention backend to use.""" - # Default case. - selected_backend = _Backend.FLASH_ATTN - # If there are no attention layers (e.g. we are running Mamba), # use the placeholder NO_ATTENTION if is_attention_free: - return _Backend.NO_ATTENTION + from vllm.attention.backends.placeholder_attn import ( + PlaceholderAttentionBackend) + return PlaceholderAttentionBackend # Check whether a particular choice of backend was # previously forced. # # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND # ENVIRONMENT VARIABLE. + selected_backend = None backend_by_global_setting: Optional[_Backend] = ( get_global_forced_attn_backend()) if backend_by_global_setting is not None: @@ -201,64 +137,13 @@ def which_attn_to_use(head_size: int, if backend_by_env_var is not None: selected_backend = backend_name_to_enum(backend_by_env_var) - # get device-specific default attn_backend - default_backend = current_platform.get_default_attn_backend( - selected_backend) - if default_backend is not None: - return default_backend - - if use_v1: - return _Backend.FLASH_ATTN_VLLM_V1 - - # FlashAttn in NVIDIA GPUs. - if selected_backend == _Backend.FLASH_ATTN: - if not current_platform.has_device_capability(80): - # Volta and Turing NVIDIA GPUs. - logger.info( - "Cannot use FlashAttention-2 backend for Volta and Turing " - "GPUs.") - selected_backend = _Backend.XFORMERS - elif dtype not in (torch.float16, torch.bfloat16): - logger.info( - "Cannot use FlashAttention-2 backend for dtype other than " - "torch.float16 or torch.bfloat16.") - selected_backend = _Backend.XFORMERS - elif kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"): - logger.info( - "Cannot use FlashAttention-2 backend for FP8 KV cache.") - logger.warning( - "Please use FlashInfer backend with FP8 KV Cache for " - "better performance by setting environment variable " - "VLLM_ATTENTION_BACKEND=FLASHINFER") - selected_backend = _Backend.XFORMERS - elif block_size % 16 != 0: - logger.info( - "Cannot use FlashAttention-2 backend for block size not " - "divisible by 16.") - selected_backend = _Backend.XFORMERS - - # FlashAttn is valid for the model, checking if the package is installed. - if selected_backend == _Backend.FLASH_ATTN: - try: - import vllm.vllm_flash_attn # noqa: F401 - from vllm.attention.backends.flash_attn import ( # noqa: F401 - FlashAttentionBackend) - - supported_sizes = FlashAttentionBackend.get_supported_head_sizes() - if head_size not in supported_sizes: - logger.info( - "Cannot use FlashAttention-2 backend for head size %d.", - head_size) - selected_backend = _Backend.XFORMERS - except ImportError: - logger.info( - "Cannot use FlashAttention-2 backend because the " - "vllm.vllm_flash_attn package is not found. 
" - "Make sure that vllm_flash_attn was built and installed " - "(on by default).") - selected_backend = _Backend.XFORMERS - - return selected_backend + # get device-specific attn_backend + attention_cls = current_platform.get_attn_backend_cls( + selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1) + if not attention_cls: + raise ValueError( + f"Invalid attention backend for {current_platform.device_name}") + return resolve_obj_by_qualname(attention_cls) @contextmanager diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 7ba7f5150150c..eb3e269cac285 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -28,10 +28,13 @@ def get_device_name(cls, device_id: int = 0) -> str: return "cpu" @classmethod - def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool) -> str: if selected_backend != _Backend.TORCH_SDPA: logger.info("Cannot use %s backend on CPU.", selected_backend) - return _Backend.TORCH_SDPA + logger.info("Using Torch SDPA backend.") + return "vllm.attention.backends.torch_sdpa.TorchSDPABackend" @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 3c5350b778345..23ceac83e49de 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -16,7 +16,7 @@ import vllm.envs as envs from vllm.logger import init_logger -from .interface import DeviceCapability, Platform, PlatformEnum +from .interface import DeviceCapability, Platform, PlatformEnum, _Backend if TYPE_CHECKING: from vllm.config import VllmConfig @@ -141,6 +141,81 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if cache_config and cache_config.block_size is None: cache_config.block_size = 16 + @classmethod + def get_attn_backend_cls(cls, selected_backend, head_size, dtype, + kv_cache_dtype, block_size, use_v1) -> str: + if use_v1: + logger.info("Using Flash Attention backend on V1 engine.") + return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" + if selected_backend == _Backend.FLASHINFER: + logger.info("Using FlashInfer backend.") + return "vllm.attention.backends.flashinfer.FlashInferBackend" + elif selected_backend == _Backend.XFORMERS: + logger.info("Using XFormers backend.") + return "vllm.attention.backends.xformers.XFormersBackend" + elif selected_backend == _Backend.FLASH_ATTN: + pass + elif selected_backend: + raise ValueError( + f"Invalid attention backend for {cls.device_name}") + + target_backend = _Backend.FLASH_ATTN + if not cls.has_device_capability(80): + # Volta and Turing NVIDIA GPUs. 
+ logger.info( + "Cannot use FlashAttention-2 backend for Volta and Turing " + "GPUs.") + target_backend = _Backend.XFORMERS + elif dtype not in (torch.float16, torch.bfloat16): + logger.info( + "Cannot use FlashAttention-2 backend for dtype other than " + "torch.float16 or torch.bfloat16.") + target_backend = _Backend.XFORMERS + elif kv_cache_dtype is not None and \ + kv_cache_dtype.startswith("fp8"): + logger.info( + "Cannot use FlashAttention-2 backend for FP8 KV cache.") + logger.warning( + "Please use FlashInfer backend with FP8 KV Cache for " + "better performance by setting environment variable " + "VLLM_ATTENTION_BACKEND=FLASHINFER") + target_backend = _Backend.XFORMERS + elif block_size % 16 != 0: + logger.info( + "Cannot use FlashAttention-2 backend for block size not " + "divisible by 16.") + target_backend = _Backend.XFORMERS + + # FlashAttn is valid for the model, checking if the package is + # installed. + if target_backend == _Backend.FLASH_ATTN: + try: + import vllm.vllm_flash_attn # noqa: F401 + from vllm.attention.backends.flash_attn import ( # noqa: F401 + FlashAttentionBackend) + + supported_sizes = \ + FlashAttentionBackend.get_supported_head_sizes() + if head_size not in supported_sizes: + logger.info( + "Cannot use FlashAttention-2 backend for head size %d.", + head_size) + target_backend = _Backend.XFORMERS + except ImportError: + logger.info( + "Cannot use FlashAttention-2 backend because the " + "vllm.vllm_flash_attn package is not found. " + "Make sure that vllm_flash_attn was built and installed " + "(on by default).") + target_backend = _Backend.XFORMERS + + if target_backend == _Backend.XFORMERS: + logger.info("Using XFormers backend.") + return "vllm.attention.backends.xformers.XFormersBackend" + + logger.info("Using Flash Attention backend.") + return "vllm.attention.backends.flash_attn.FlashAttentionBackend" + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 0a44f2b74163a..8152d881fa8d9 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -21,8 +21,11 @@ class HpuPlatform(Platform): dispatch_key: str = "HPU" @classmethod - def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: - return _Backend.HPU_ATTN + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool) -> str: + logger.info("Using HPUAttention backend.") + return "vllm.attention.backends.hpu_attn.HPUAttentionBackend" @classmethod def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index ddccaa2ce0148..f440358f65fbb 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -112,9 +112,11 @@ def is_cuda_alike(self) -> bool: return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) @classmethod - def get_default_attn_backend(cls, selected_backend: _Backend): - """Get the default attention backend of a device.""" - return None + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool) -> str: + """Get the attention backend class of a device.""" + return "" @classmethod def get_device_capability( diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 16eb8dc81efc2..9390eda535c8f 100644 --- a/vllm/platforms/openvino.py +++ 
b/vllm/platforms/openvino.py @@ -28,10 +28,13 @@ class OpenVinoPlatform(Platform): dispatch_key: str = "CPU" @classmethod - def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool) -> str: if selected_backend != _Backend.OPENVINO: logger.info("Cannot use %s backend on OpenVINO.", selected_backend) - return _Backend.OPENVINO + logger.info("Using OpenVINO Attention backend.") + return "vllm.attention.backends.openvino.OpenVINOAttentionBackend" @classmethod def get_device_name(cls, device_id: int = 0) -> str: diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index aa779f265135f..1c2f602efc856 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -70,7 +70,8 @@ class RocmPlatform(Platform): ] @classmethod - def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + def get_attn_backend_cls(cls, selected_backend, head_size, dtype, + kv_cache_dtype, block_size, use_v1) -> str: selected_backend = (_Backend.ROCM_FLASH if selected_backend == _Backend.FLASH_ATTN else selected_backend) if selected_backend == _Backend.ROCM_FLASH: @@ -79,7 +80,8 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: logger.info("flash_attn is not supported on NAVI GPUs.") else: logger.info("%s is not supported in AMD GPUs.", selected_backend) - return _Backend.ROCM_FLASH + logger.info("Using ROCmFlashAttention backend.") + return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend" # noqa: E501 @classmethod @lru_cache(maxsize=8) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index d488daf056f1a..8a59b53ca4b15 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -24,10 +24,13 @@ class TpuPlatform(Platform): ] @classmethod - def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool) -> str: if selected_backend != _Backend.PALLAS: logger.info("Cannot use %s backend on TPU.", selected_backend) - return _Backend.PALLAS + logger.info("Using Pallas backend.") + return "vllm.attention.backends.pallas.PallasAttentionBackend" @classmethod def get_device_name(cls, device_id: int = 0) -> str: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 78e17c2afec65..00692a5d23031 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -21,10 +21,13 @@ class XPUPlatform(Platform): dispatch_key: str = "XPU" @classmethod - def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool) -> str: if selected_backend != _Backend.IPEX: logger.info("Cannot use %s backend on XPU.", selected_backend) - return _Backend.IPEX + logger.info("Using IPEX attention backend.") + return "vllm.attention.backends.ipex_attn.IpexAttnBackend" @staticmethod def get_device_capability(device_id: int = 0) -> DeviceCapability: From bd8287221187279c668ac10c3edd5242b8d8b429 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 9 Jan 2025 22:47:29 +0800 Subject: [PATCH 36/55] [ci]try to fix flaky multi-step tests (#11894) Signed-off-by: youkaichao --- tests/multi_step/test_correctness_async_llm.py | 3 +-- tests/utils.py | 9 
+++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 7203d635c2fa8..8456a463adeeb 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -16,7 +16,6 @@ NUM_PROMPTS = [10] DEFAULT_SERVER_ARGS: List[str] = [ - "--disable-log-requests", "--worker-use-ray", "--gpu-memory-utilization", "0.85", @@ -110,7 +109,7 @@ async def test_multi_step( # Spin up client/server & issue completion API requests. # Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to + # was raised 5x to 1200 *just for this test* due to # observed timeouts in GHA CI ref_completions = await completions_with_server_args( prompts, diff --git a/tests/utils.py b/tests/utils.py index bf3d88194e4ca..f4eecf19e8c64 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -157,13 +157,19 @@ def url_root(self) -> str: def url_for(self, *parts: str) -> str: return self.url_root + "/" + "/".join(parts) - def get_client(self): + def get_client(self, **kwargs): + if "timeout" not in kwargs: + kwargs["timeout"] = 600 return openai.OpenAI( base_url=self.url_for("v1"), api_key=self.DUMMY_API_KEY, + max_retries=0, + **kwargs, ) def get_async_client(self, **kwargs): + if "timeout" not in kwargs: + kwargs["timeout"] = 600 return openai.AsyncOpenAI(base_url=self.url_for("v1"), api_key=self.DUMMY_API_KEY, max_retries=0, @@ -780,7 +786,6 @@ async def completions_with_server_args( assert len(max_tokens) == len(prompts) outputs = None - max_wait_seconds = 240 * 3 # 240 is default with RemoteOpenAIServer(model_name, server_cli_args, max_wait_seconds=max_wait_seconds) as server: From 9a228348d2f9a2c85dfc67d6b9fe883bf10a4680 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 10 Jan 2025 01:19:37 +0800 Subject: [PATCH 37/55] [Misc] Provide correct Pixtral-HF chat template (#11891) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.md | 61 ++++++++++++++------------ examples/template_pixtral_hf.jinja | 38 ++++++++++++++++ tests/entrypoints/test_chat_utils.py | 1 + 3 files changed, 73 insertions(+), 27 deletions(-) create mode 100644 examples/template_pixtral_hf.jinja diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 3ba34c77205e5..acbe27a22a679 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -322,7 +322,7 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Qwen2ForCausalLM` - - Qwen2 + - QwQ, Qwen2 - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. - ✅︎ - ✅︎ @@ -436,7 +436,7 @@ loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/t ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings +{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings of the whole prompt are extracted from the normalized hidden state corresponding to the last token. #### Reward Modeling (`--task reward`) @@ -468,7 +468,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_reward_model`. 
By default, we return the hidden states of each token directly. +{func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. ```{important} For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, @@ -499,7 +499,7 @@ e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "r ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. +{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) @@ -550,6 +550,28 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. +````{important} +To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) +or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: + +Offline inference: +```python +llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, +) +``` + +Online inference: +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +``` +```` + +```{note} +vLLM currently only supports adding LoRA to the language backbone of multimodal models. +``` + ### Generative Models See [this page](#generative-models) for more information on how to use generative models. @@ -689,14 +711,14 @@ See [this page](#generative-models) for more information on how to use generativ * - `Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - T + IE+ - - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc. + - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. - - ✅︎ - ✅︎ * - `PixtralForConditionalGeneration` - Pixtral - T + I+ - - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc. + - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. - - ✅︎ - ✅︎ @@ -715,7 +737,7 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Qwen2VLForConditionalGeneration` - - Qwen2-VL + - QVQ, Qwen2-VL - T + IE+ + VE+ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ @@ -733,26 +755,6 @@ See [this page](#generative-models) for more information on how to use generativ E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -````{important} -To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) -or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: - -```python -llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, -) -``` - -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 -``` -```` - -```{note} -vLLM currently only supports adding LoRA to the language backbone of multimodal models. 
-``` - ```{note} To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` @@ -762,6 +764,11 @@ The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (` For more details, please see: ``` +```{note} +The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). +A corrected version is available at . +``` + ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. diff --git a/examples/template_pixtral_hf.jinja b/examples/template_pixtral_hf.jinja new file mode 100644 index 0000000000000..e94661cb39071 --- /dev/null +++ b/examples/template_pixtral_hf.jinja @@ -0,0 +1,38 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif %} + {%- if message["role"] == "user" %} + {%- if loop.last and system_message is defined %} + {{- "[INST]" + system_message + "\n" }} + {%- else %} + {{- "[INST]" }} + {%- endif %} + {%- if message["content"] is not string %} + {%- for chunk in message["content"] %} + {%- if chunk["type"] == "text" %} + {{- chunk["text"] }} + {%- elif chunk["type"] == "image" %} + {{- "[IMG]" }} + {%- else %} + {{- raise_exception("Unrecognized content type!") }} + {%- endif %} + {%- endfor %} + {%- else %} + {{- message["content"] }} + {%- endif %} + {{- "[/INST]" }} + {%- elif message["role"] == "assistant" %} + {{- message["content"] + eos_token}} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index d63b963522e73..8f242df4a60e3 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -758,6 +758,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_llava.jinja", "string"), + ("template_pixtral_hf.jinja", "openai"), ("template_vlm2vec.jinja", "openai"), ("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), From 36f5303578397d122693a19007be38ba2f02bcbc Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 9 Jan 2025 15:26:37 -0800 Subject: [PATCH 38/55] [Docs] Add Modal to deployment frameworks (#11907) --- docs/source/deployment/frameworks/bentoml.md | 2 +- docs/source/deployment/frameworks/index.md | 1 + docs/source/deployment/frameworks/modal.md | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 docs/source/deployment/frameworks/modal.md diff --git a/docs/source/deployment/frameworks/bentoml.md b/docs/source/deployment/frameworks/bentoml.md index ea0b5d1d4c93b..2bf435bda8380 100644 --- a/docs/source/deployment/frameworks/bentoml.md +++ b/docs/source/deployment/frameworks/bentoml.md @@ -2,6 +2,6 @@ # BentoML -[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with 
vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. +[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html). diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md index 6a59131d36618..964782763f6b3 100644 --- a/docs/source/deployment/frameworks/index.md +++ b/docs/source/deployment/frameworks/index.md @@ -8,6 +8,7 @@ cerebrium dstack helm lws +modal skypilot triton ``` diff --git a/docs/source/deployment/frameworks/modal.md b/docs/source/deployment/frameworks/modal.md new file mode 100644 index 0000000000000..e7c42088e36a9 --- /dev/null +++ b/docs/source/deployment/frameworks/modal.md @@ -0,0 +1,7 @@ +(deployment-modal)= + +# Modal + +vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling. + +For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference). From c3cf54dda4df200bc8913ed69d210a7108dfa320 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 10 Jan 2025 11:10:12 +0800 Subject: [PATCH 39/55] [Doc][5/N] Move Community and API Reference to the bottom (#11896) Signed-off-by: DarkLight1337 Co-authored-by: Simon Mo --- README.md | 2 +- .../source/design/automatic_prefix_caching.md | 2 +- docs/source/index.md | 62 ++++++++++++------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 253a0bb913e37..67c557bfe13a9 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is fast with: - State-of-the-art serving throughput -- Efficient management of attention key and value memory with **PagedAttention** +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md index 4398536b2b4ad..6d3dd056e6a60 100644 --- a/docs/source/design/automatic_prefix_caching.md +++ b/docs/source/design/automatic_prefix_caching.md @@ -2,7 +2,7 @@ # Automatic Prefix Caching -The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. +The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. 
The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. diff --git a/docs/source/index.md b/docs/source/index.md index 23e4304fe29d9..356fa4b7fd573 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -26,7 +26,7 @@ vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is fast with: - State-of-the-art serving throughput -- Efficient management of attention key and value memory with **PagedAttention** +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 @@ -54,6 +54,8 @@ For more information, check out the following: ## Documentation +% How to start using vLLM? + ```{toctree} :caption: Getting Started :maxdepth: 1 @@ -65,6 +67,8 @@ getting_started/troubleshooting getting_started/faq ``` +% What does vLLM support? + ```{toctree} :caption: Models :maxdepth: 1 @@ -75,6 +79,8 @@ models/supported_models models/extensions/index ``` +% Additional capabilities + ```{toctree} :caption: Features :maxdepth: 1 @@ -89,6 +95,8 @@ features/spec_decode features/compatibility_matrix ``` +% Details about running vLLM + ```{toctree} :caption: Inference and Serving :maxdepth: 1 @@ -104,6 +112,8 @@ serving/usage_stats serving/integrations/index ``` +% Scaling up vLLM for production + ```{toctree} :caption: Deployment :maxdepth: 1 @@ -115,6 +125,8 @@ deployment/frameworks/index deployment/integrations/index ``` +% Making the most out of vLLM + ```{toctree} :caption: Performance :maxdepth: 1 @@ -123,28 +135,7 @@ performance/optimization performance/benchmarks ``` -% Community: User community resources - -```{toctree} -:caption: Community -:maxdepth: 1 - -community/meetups -community/sponsors -``` - -```{toctree} -:caption: API Reference -:maxdepth: 2 - -api/offline_inference/index -api/engine/index -api/inference_params -api/multimodal/index -api/model/index -``` - -% Design Documents: Details about vLLM internals +% Explanation of vLLM internals ```{toctree} :caption: Design Documents @@ -159,7 +150,7 @@ design/automatic_prefix_caching design/multiprocessing ``` -% Developer Guide: How to contribute to the vLLM project +% How to contribute to the vLLM project ```{toctree} :caption: Developer Guide @@ -172,6 +163,29 @@ contributing/model/index contributing/vulnerability_management ``` +% Technical API specifications + +```{toctree} +:caption: API Reference +:maxdepth: 2 + +api/offline_inference/index +api/engine/index +api/inference_params +api/multimodal/index +api/model/index +``` + +% Latest news and acknowledgements + +```{toctree} +:caption: Community +:maxdepth: 1 + +community/meetups +community/sponsors +``` + # Indices and tables - {ref}`genindex` From b844b99ad309b05f37b1acb5360c82be7b16281d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 10 Jan 2025 11:24:00 +0800 Subject: [PATCH 40/55] [VLM] Enable tokenized inputs for merged multi-modal processor (#11900) Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 31 ++++-- vllm/inputs/data.py | 4 +- 
vllm/inputs/preprocess.py | 4 - vllm/model_executor/models/blip2.py | 22 +++- vllm/model_executor/models/chameleon.py | 32 +++++- vllm/model_executor/models/fuyu.py | 24 +++-- vllm/model_executor/models/interfaces.py | 8 +- vllm/model_executor/models/llava.py | 8 +- vllm/model_executor/models/phi3v.py | 4 +- vllm/model_executor/models/ultravox.py | 18 ++-- vllm/multimodal/processing.py | 127 ++++++++++++++++------- vllm/multimodal/profiling.py | 2 +- 12 files changed, 207 insertions(+), 77 deletions(-) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index d98bd9736b65f..d18909a4197b6 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -649,7 +649,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ) -def _test_processing_cache_correctness( +def _test_processing_correctness( model_id: str, modalities: dict[str, bool], hit_rate: float, @@ -691,6 +691,7 @@ def _test_processing_cache_correctness( baseline_processor = factories.build_processor(ctx, cache=None) cached_processor = factories.build_processor(ctx, cache=cache) dummy_inputs = baseline_processor.dummy_inputs + tokenizer = baseline_processor.info.get_tokenizer() rng = np.random.RandomState(0) @@ -747,7 +748,25 @@ def _test_processing_cache_correctness( ) assert baseline_result == cached_result, ( - f"Failed ({batch_idx=}, {mm_data=})") + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + baseline_tokenized_result = baseline_processor.apply( + tokenizer.encode(prompt), + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == baseline_tokenized_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + cached_tokenized_result = cached_processor.apply( + tokenizer.encode(prompt), + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert cached_result == cached_tokenized_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") # yapf: disable @@ -771,14 +790,14 @@ def _test_processing_cache_correctness( @pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("simplify_rate", [1.0]) # yapf: enable -def test_processing_cache_correctness( +def test_processing_correctness( model_id: str, modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, ): - _test_processing_cache_correctness( + _test_processing_correctness( model_id, modalities, hit_rate=hit_rate, @@ -795,7 +814,7 @@ def test_processing_cache_correctness( @pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("simplify_rate", [1.0]) # yapf: enable -def test_processing_cache_correctness_phi3v( +def test_processing_correctness_phi3v( model_id: str, modalities: dict[str, bool], hit_rate: float, @@ -809,7 +828,7 @@ def test_processing_cache_correctness_phi3v( AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) - _test_processing_cache_correctness( + _test_processing_correctness( model_id, modalities, hit_rate=hit_rate, diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index cdaf6dd76eaa1..b8163a7acde1d 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -44,13 +44,13 @@ class TokensPrompt(TypedDict): multi_modal_data: NotRequired["MultiModalDataDict"] """ - DEPRECATED: Optional multi-modal data to pass to the model, + Optional multi-modal data to pass to the model, if the model supports it. 
""" mm_processor_kwargs: NotRequired[Dict[str, Any]] """ - DEPRECATED: Optional multi-modal processor kwargs to be forwarded to the + Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them. diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index a738ffe18e3ae..0890883cc984f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -279,10 +279,6 @@ async def _process_multimodal_async( mm_processor = self.mm_registry.create_processor( self.model_config, tokenizer) - if isinstance(prompt, list): - logger.warning("Passing `multi_modal_data` in TokensPrompt is" - "deprecated and will be removed in a future update") - prompt = tokenizer.decode(prompt) if mm_processor_kwargs is None: mm_processor_kwargs = {} diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 7dfc0b687c6e3..917b88e802071 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -441,6 +441,24 @@ def get_dummy_processor_inputs( class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if not mm_data: + # HF processor always adds placeholders even when there's no image + tokenizer = self.info.get_tokenizer() + prompt_ids = tokenizer.encode(prompt) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -469,11 +487,11 @@ def _get_prompt_replacements( def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only tokens should be considered as placeholders, # so we ignore the trailing bos_token diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 452fe727875fe..a6634204699c9 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -99,6 +99,34 @@ def get_dummy_processor_inputs( class ChameleonMultiModalProcessor( BaseMultiModalProcessor[ChameleonProcessingInfo]): + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if not mm_data: + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + # HF processor adds sep token for chat mode + tokenizer = self.info.get_tokenizer() + sep_token_id: int = \ + tokenizer.vocab[tokenizer.sep_token] # type: ignore + + return prompt_tokens + [sep_token_id] + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -128,11 +156,11 @@ def _get_prompt_replacements( def apply( self, - prompt_text: str, + prompt: 
Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only tokens should be considered as placeholders, # so we ignore the image_start_token and image_end_token diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 59af5f0b3ae98..63e7147f84e03 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,7 +16,7 @@ """ PyTorch Fuyu model.""" import math from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict) + TypedDict, Union) import torch import torch.nn as nn @@ -149,14 +149,10 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - if not mm_data: # Avoid warning from HF logger for text-only input - # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id - # Tokenizer won't add boa_token_id by default, we add it manually. - tokenizer = self.info.get_tokenizer() - boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore - prompt_ids = tokenizer.encode(prompt) + [boa_token_id] + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") processed_outputs = super()._call_hf_processor( @@ -181,6 +177,16 @@ def _call_hf_processor( return processed_outputs + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + # HF processor adds boa_token_id + tokenizer = self.info.get_tokenizer() + boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + + return prompt_tokens + [boa_token_id] + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -223,11 +229,11 @@ def get_replacement_fuyu(item_idx: int): def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only |SPEAKER| (image) tokens should be considered as placeholders, # so we ignore the trailing bos_token_id diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index b51cba86ec1a4..c5fd0d9332379 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -39,13 +39,13 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]: The output embeddings must be one of the following formats: - - A list or tuple of 2D tensors, where each tensor corresponds to - each input multimodal data item (e.g, image). + - A list or tuple of 2D tensors, where each tensor corresponds to + each input multimodal data item (e.g, image). - A single 3D tensor, with the batch dimension grouping the 2D tensors. Note: - The returned multimodal embeddings must be in the same order as - the appearances of their corresponding multimodal data item in the + The returned multimodal embeddings must be in the same order as + the appearances of their corresponding multimodal data item in the input prompt. """ ... 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 8d94acf3b21d5..bb3db60c7d8ed 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -724,7 +724,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: @@ -737,7 +737,7 @@ def apply( image_height=-1, ) - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) mm_items = self._to_mm_items(mm_data) mm_item_counts = mm_items.get_all_counts() @@ -760,7 +760,7 @@ def get_replacement_mantis(item_idx: int): ) ]) - prompt_ids, prompt_text, _ = self._apply_prompt_replacements( + prompt_ids, prompt, _ = self._apply_prompt_replacements( result["prompt_token_ids"], mantis_mm_repls, mm_item_counts, @@ -788,7 +788,7 @@ def get_replacement_mantis(item_idx: int): return MultiModalInputsV2( type="multimodal", - prompt=prompt_text, + prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, mm_placeholders=mm_placeholder_ranges, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a1b1af35604db..7a230e5beb367 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -481,11 +481,11 @@ def _apply_prompt_replacements( def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only <|image|> tokens should be considered as placeholders, # so we ignore the trailing bos_token_id diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index fada22d685dd6..3edfb5107683a 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -138,12 +138,8 @@ def _call_hf_processor( ) -> BatchFeature: # Text-only input not supported in composite processor if not mm_data: - tokenizer = self.info.get_tokenizer() - - prompt_ids = tokenizer.encode( - prompt, - add_special_tokens=False, # type: ignore - ) + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") mm_data = dict(mm_data) @@ -188,6 +184,16 @@ def _call_hf_processor( ) return BatchFeature(combined_outputs) + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + # HF processor omits bos_token_id by setting add_special_tokens=False + tokenizer = self.info.get_tokenizer() + assert prompt_tokens[0] == tokenizer.bos_token_id + + return prompt_tokens[1:] + def _get_mm_fields_config( self, hf_inputs: BatchFeature, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 07d883d5d7295..8b47dfb07387f 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -725,15 +725,15 @@ def _call_hf_processor( mm_kwargs, ) - def _apply_hf_processor( + def _apply_hf_processor_text_mm( self, prompt_text: str, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: """ - Wrapper of :meth:`_call_hf_processor` that applies - additional 
pre-processing and post-processing. + Apply the HF processor on the prompt text and multi-modal data + together. """ processor_data, passthrough_data = self._get_hf_mm_data(mm_items) @@ -753,40 +753,93 @@ def _apply_hf_processor( return prompt_ids, mm_kwargs - def _apply_hf_processor_missing( - self, - prompt_text: str, - mm_missing_data_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - ): + def _apply_hf_processor_text_only(self, prompt_text: str) -> list[int]: """ - Apply the HF processor on the full prompt text, but only on the - multi-modal data that are missing from the cache. + Apply the HF processor on the prompt text only. - Note: - We pass prompt text and multi-modal data into the HF processor - in separate calls to avoid HF prompt replacement being done for - cached items; instead, we rely on our own prompt replacement logic - (:meth:`_get_prompt_replacements`) for the full text. + Since HF processor requires that text and multi-modal items + correspond to each other, we create dummy multi-modal items + to go along with the text. """ - mm_missing_counts = mm_missing_data_items.get_all_counts() - - prompt_ids, _ = self._apply_hf_processor( + prompt_ids, _ = self._apply_hf_processor_text_mm( prompt_text=prompt_text, mm_items=MultiModalDataItems({}), hf_processor_mm_kwargs={}, ) - # Some HF processors (e.g. Qwen2-VL) expect corresponding - # multi-modal tokens to be in the prompt text + return prompt_ids + + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + """ + Apply the HF processor on the prompt tokens only. + + Most HF processors accept prompt text but not prompt tokens. + If the HF processor adds or removes tokens that are not related to + multi-modal data, you should override this method so it is consistent + with the output of :meth:`_apply_hf_processor_text_only` on the + corresponding text. + """ + return prompt_tokens + + def _apply_hf_processor_mm_only( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalKwargs: + """ + Apply the HF processor on the multi-modal data only. + + Since HF processor requires that text and multi-modal items + correspond to each other, we generate dummy text using + :class:`DummyInputsBuilder` to go along with the multi-modal data. + """ + mm_counts = mm_items.get_all_counts() + dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs( self.info.ctx.model_config.max_model_len, - mm_missing_counts, + mm_counts, ) - _, mm_missing_kwargs = self._apply_hf_processor( + _, mm_kwargs = self._apply_hf_processor_text_mm( prompt_text=dummy_inputs.prompt_text, - mm_items=mm_missing_data_items, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + return mm_kwargs + + def _apply_hf_processor_main( + self, + prompt: Union[str, list[int]], + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + *, + enable_hf_prompt_replacement: bool, + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the prompt text and multi-modal data. + + Note: + If :code:`enable_hf_prompt_replacement=False`, the prompt should + correspond to the multi-modal items. 
+ """ + if isinstance(prompt, str): + if enable_hf_prompt_replacement: + return self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + prompt_ids = self._apply_hf_processor_text_only(prompt) + else: + prompt_ids = self._apply_hf_processor_tokens_only(prompt) + + mm_missing_kwargs = self._apply_hf_processor_mm_only( + mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, ) @@ -794,7 +847,7 @@ def _apply_hf_processor_missing( def _cached_apply_hf_processor( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: @@ -807,10 +860,11 @@ def _cached_apply_hf_processor( _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: - return self._apply_hf_processor( - prompt_text=prompt_text, + return self._apply_hf_processor_main( + prompt=prompt, mm_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=True, ) mm_maybe_cached_kw_items = { @@ -832,10 +886,13 @@ def _cached_apply_hf_processor( } mm_missing_data_items = self._to_mm_items(mm_missing_data) - prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing( - prompt_text=prompt_text, - mm_missing_data_items=mm_missing_data_items, + # NOTE: `prompt` does not correspond to `mm_missing_data_items`, + # so we need to pass `enable_hf_prompt_replacement=False` + prompt_ids, mm_missing_kwargs = self._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_missing_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=False, ) mm_missing_next_idx = { @@ -1018,7 +1075,7 @@ def _validate_mm_placeholders( def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: @@ -1056,7 +1113,7 @@ def apply( mm_hashes = None prompt_ids, mm_kwargs = self._cached_apply_hf_processor( - prompt_text, + prompt, mm_items, hf_processor_mm_kwargs, ) @@ -1101,12 +1158,12 @@ def apply( # there is no need for us to insert them if all(len(repls) == 0 for repls in mm_missing_repls.items()): tokenizer = self.info.get_tokenizer() - prompt_text = decode_tokens(tokenizer, prompt_ids) + prompt = decode_tokens(tokenizer, prompt_ids) mm_placeholders = hf_mm_placeholders else: ( prompt_ids, - prompt_text, + prompt, missing_mm_placeholders, ) = self._apply_prompt_replacements( prompt_ids, @@ -1125,7 +1182,7 @@ def apply( return MultiModalInputsV2( type="multimodal", - prompt=prompt_text, + prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, mm_hashes=mm_hashes, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 6f7da1509990f..ec580cd6ecddd 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -137,7 +137,7 @@ def _get_dummy_mm_inputs( seq_len, mm_counts) return self.processor.apply( - prompt_text=processor_inputs.prompt_text, + prompt=processor_inputs.prompt_text, mm_data=processor_inputs.mm_data, hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, ) From 3de2b1eafb12e420c563cb7153d4d2f0e8451ca9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 10 Jan 2025 11:25:20 +0800 Subject: [PATCH 41/55] [Doc] Show default pooling method in a table (#11904) Signed-off-by: DarkLight1337 --- docs/source/models/generative_models.md | 8 ++-- docs/source/models/pooling_models.md | 
59 +++++++++++++++++-------- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 6228c7c2ac957..a9f74c4d3fbb8 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -8,14 +8,14 @@ In vLLM, generative models implement the {class}`~vllm.model_executor.models.Vll Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text. +For generative models, the only supported `--task` option is `"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. See [Engine Arguments](#engine-args) for a list of options when initializing the model. -For generative models, the only supported {code}`task` option is {code}`"generate"`. -Usually, this is automatically inferred so you don't have to specify it. - ### `LLM.generate` The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM. @@ -33,7 +33,7 @@ for output in outputs: ``` You can optionally control the language generation by passing {class}`~vllm.SamplingParams`. -For example, you can use greedy sampling by setting {code}`temperature=0`: +For example, you can use greedy sampling by setting `temperature=0`: ```python llm = LLM(model="facebook/opt-125m") diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 3e4407cfdc233..745f3fd81980d 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -14,30 +14,53 @@ As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM feature pooling models as they only work on the generation or decode stage, so performance may not improve as much. ``` -## Offline Inference - -The {class}`~vllm.LLM` class provides various methods for offline inference. -See [Engine Arguments](#engine-args) for a list of options when initializing the model. - -For pooling models, we support the following {code}`task` options: - -- Embedding ({code}`"embed"` / {code}`"embedding"`) -- Classification ({code}`"classify"`) -- Sentence Pair Scoring ({code}`"score"`) -- Reward Modeling ({code}`"reward"`) +For pooling models, we support the following `--task` options. +The selected option sets the default pooler used to extract the final hidden states: + +```{list-table} +:widths: 50 25 25 25 +:header-rows: 1 + +* - Task + - Pooling Type + - Normalization + - Softmax +* - Embedding (`embed`) + - `LAST` + - ✅︎ + - ✗ +* - Classification (`classify`) + - `LAST` + - ✗ + - ✅︎ +* - Sentence Pair Scoring (`score`) + - \* + - \* + - \* +* - Reward Modeling (`reward`) + - `ALL` + - ✗ + - ✗ +``` -The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used: +\*The default pooler is always defined by the model. -- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. -- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. -- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. -- Reward Modeling: Extract all of the hidden states and return them directly. 
+```{note} +If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. +``` When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, -we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`). +we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). -You can customize the model's pooling method via the {code}`override_pooler_config` option, +```{tip} +You can customize the model's pooling method via the `--override-pooler-config` option, which takes priority over both the model's and Sentence Transformers's defaults. +``` + +## Offline Inference + +The {class}`~vllm.LLM` class provides various methods for offline inference. +See [Engine Arguments](#engine-args) for a list of options when initializing the model. ### `LLM.encode` From cf5f000d218fbcbc4bf404de8ed9d9607a128c3b Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Fri, 10 Jan 2025 13:14:42 +0800 Subject: [PATCH 42/55] [torch.compile] Hide KV cache behind torch.compile boundary (#11677) Signed-off-by: Chen Zhang --- tests/kernels/test_encoder_decoder_attn.py | 18 +++-- tests/test_utils.py | 85 +++++++++++++++++++++- tests/v1/engine/test_engine_core.py | 3 + tests/v1/engine/test_engine_core_client.py | 3 + vllm/attention/layer.py | 29 +++++--- vllm/config.py | 1 - vllm/forward_context.py | 33 +++++---- vllm/utils.py | 35 +++++++++ vllm/v1/worker/gpu_model_runner.py | 6 +- vllm/worker/cpu_enc_dec_model_runner.py | 3 +- vllm/worker/cpu_model_runner.py | 3 +- vllm/worker/cpu_pooling_model_runner.py | 3 +- vllm/worker/cpu_worker.py | 4 +- vllm/worker/enc_dec_model_runner.py | 3 +- vllm/worker/model_runner.py | 5 +- vllm/worker/pooling_model_runner.py | 3 +- vllm/worker/worker.py | 4 +- vllm/worker/worker_base.py | 1 + 18 files changed, 198 insertions(+), 44 deletions(-) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 614674375786e..e008a56de6208 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -142,12 +142,18 @@ class that Attention will automatically select when it is constructed. 
torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE)) # Construct KV cache - kv_cache = make_kv_cache(test_pt.num_blocks, - test_pt.num_heads, - test_pt.head_size, - test_pt.block_size, - device=CUDA_DEVICE, - backend=test_pt.backend_name) + if test_pt.attn_type in (AttentionType.DECODER, + AttentionType.ENCODER_DECODER): + kv_cache = make_kv_cache(test_pt.num_blocks, + test_pt.num_heads, + test_pt.head_size, + test_pt.block_size, + device=CUDA_DEVICE, + backend=test_pt.backend_name) + else: + kv_cache = torch.tensor([]) + + attn.kv_cache = [kv_cache] return TestResources(scale, attn, kv_cache) diff --git a/tests/test_utils.py b/tests/test_utils.py index 14d2fbd63b90d..6810e0302f897 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,9 +7,11 @@ import torch from vllm_test_utils import monitor +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.utils import (FlexibleArgumentParser, PlaceholderModule, - StoreBoolean, deprecate_kwargs, get_open_port, - memory_profiling, merge_async_iterators, supports_kw) + StoreBoolean, bind_kv_cache, deprecate_kwargs, + get_open_port, memory_profiling, merge_async_iterators, + supports_kw) from .utils import error_on_warning, fork_new_process_for_each_test @@ -325,6 +327,85 @@ def measure_current_non_torch(): lib.cudaFree(handle2) +def test_bind_kv_cache(): + from vllm.attention import Attention + + ctx = { + 'layers.0.self_attn': Attention(32, 128, 0.1), + 'layers.1.self_attn': Attention(32, 128, 0.1), + 'layers.2.self_attn': Attention(32, 128, 0.1), + 'layers.3.self_attn': Attention(32, 128, 0.1), + } + kv_cache = [ + torch.zeros((1, )), + torch.zeros((1, )), + torch.zeros((1, )), + torch.zeros((1, )), + ] + bind_kv_cache(ctx, [kv_cache]) + assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0] + assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1] + assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2] + assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3] + +def test_bind_kv_cache_non_attention(): + from vllm.attention import Attention + + # example from Jamba PP=2 + ctx = { + 'model.layers.20.attn': Attention(32, 128, 0.1), + 'model.layers.28.attn': Attention(32, 128, 0.1), + } + kv_cache = [ + torch.zeros((1, )), + torch.zeros((1, )), + ] + bind_kv_cache(ctx, [kv_cache]) + assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[0] + assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] + + +def test_bind_kv_cache_encoder_decoder(): + from vllm.attention import Attention, AttentionType + + # example from bart + ctx = { + 'encoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), + 'decoder.layers.0.encoder_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), + 'decoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), + } + + kv_cache = [ + torch.zeros((1, )), + ] + encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache + + bind_kv_cache(ctx, [kv_cache]) + assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache + assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] + assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] + + +def test_bind_kv_cache_pp(): + cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2)) + with set_current_vllm_config(cfg): + from vllm.attention import Attention + + ctx = { + 'layers.0.self_attn': Attention(32, 128, 0.1), + } + kv_cache = [ + 
[torch.zeros((1, ))], + [torch.zeros((1, ))] + ] + bind_kv_cache(ctx, kv_cache) + assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0][0] + assert ctx['layers.0.self_attn'].kv_cache[1] is kv_cache[1][0] + + def test_placeholder_module_error_handling(): placeholder = PlaceholderModule("placeholder_1234") diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 8dd9b23fbdd5f..5b1732036e807 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -4,6 +4,7 @@ import pytest from transformers import AutoTokenizer +from tests.utils import fork_new_process_for_each_test from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform @@ -36,6 +37,7 @@ def make_request() -> EngineCoreRequest: ) +@fork_new_process_for_each_test def test_engine_core(monkeypatch): with monkeypatch.context() as m: @@ -138,6 +140,7 @@ def test_engine_core(monkeypatch): assert len(engine_core.scheduler.running) == 0 +@fork_new_process_for_each_test def test_engine_core_advanced_sampling(monkeypatch): """ A basic end-to-end test to verify that the engine functions correctly diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 5a21806e57a11..7eac16f2cf542 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -6,6 +6,7 @@ import pytest from transformers import AutoTokenizer +from tests.utils import fork_new_process_for_each_test from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform @@ -75,6 +76,7 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): break +@fork_new_process_for_each_test @pytest.mark.parametrize("multiprocessing_mode", [True, False]) def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): @@ -143,6 +145,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): client.abort_requests([request.request_id]) +@fork_new_process_for_each_test @pytest.mark.asyncio async def test_engine_core_client_asyncio(monkeypatch): diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f1b3598e60b54..55e4e14027f79 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -121,6 +121,13 @@ def __init__( compilation_config.static_forward_context[prefix] = self self.layer_name = prefix self.attn_type = attn_type + # use a placeholder kv cache tensor during init, which will be replaced + # by bind_kv_cache + # this variable will not be accessed if use_direct_call is True + self.kv_cache = [ + torch.tensor([]) for _ in range(get_current_vllm_config( + ).parallel_config.pipeline_parallel_size) + ] def forward( self, @@ -148,11 +155,11 @@ def forward( if value is not None: value = value.view(-1, self.num_kv_heads, self.head_size) torch.ops.vllm.unified_attention_with_output( - query, key, value, output, kv_cache, self.layer_name) + query, key, value, output, self.layer_name) return output.view(-1, hidden_size) else: return torch.ops.vllm.unified_attention(query, key, value, - kv_cache, self.layer_name) + self.layer_name) def extra_repr(self) -> str: s = f"head_size={self.impl.head_size}" # type: ignore @@ -230,12 +237,12 @@ def unified_attention( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: torch.Tensor, layer_name: str, ) -> torch.Tensor: forward_context: ForwardContext = get_forward_context() - attn_metadata = 
forward_context.dynamic_forward_context - self = forward_context.static_forward_context[layer_name] + attn_metadata = forward_context.attn_metadata + self = forward_context.attn_layers[layer_name] + kv_cache = self.kv_cache[forward_context.virtual_engine] return self.impl.forward(query, key, value, kv_cache, attn_metadata, self._k_scale, self._v_scale) @@ -244,7 +251,6 @@ def unified_attention_fake( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: torch.Tensor, layer_name: str, ) -> torch.Tensor: return torch.empty_like(query).contiguous() @@ -253,7 +259,7 @@ def unified_attention_fake( direct_register_custom_op( op_name="unified_attention", op_func=unified_attention, - mutates_args=["kv_cache"], + mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, ) @@ -264,12 +270,12 @@ def unified_attention_with_output( key: torch.Tensor, value: torch.Tensor, output: torch.Tensor, - kv_cache: torch.Tensor, layer_name: str, ) -> None: forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.dynamic_forward_context - self = forward_context.static_forward_context[layer_name] + attn_metadata = forward_context.attn_metadata + self = forward_context.attn_layers[layer_name] + kv_cache = self.kv_cache[forward_context.virtual_engine] self.impl.forward(query, key, value, @@ -285,7 +291,6 @@ def unified_attention_with_output_fake( key: torch.Tensor, value: torch.Tensor, output: torch.Tensor, - kv_cache: torch.Tensor, layer_name: str, ) -> None: return @@ -294,7 +299,7 @@ def unified_attention_with_output_fake( direct_register_custom_op( op_name="unified_attention_with_output", op_func=unified_attention_with_output, - mutates_args=["kv_cache", "output"], + mutates_args=["output"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/config.py b/vllm/config.py index 19609085cc960..13b5390008a35 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2780,7 +2780,6 @@ def model_post_init(self, __context: Any) -> None: compilation_time: float = PrivateAttr # Per-model forward context - # Mainly used to store attention cls # Map from layer name to the attention cls static_forward_context: Dict[str, Any] = PrivateAttr diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 7f56575279e9b..828b394ec5d21 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -2,7 +2,7 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch @@ -10,6 +10,9 @@ from vllm.config import VllmConfig from vllm.logger import init_logger +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + logger = init_logger(__name__) track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0 @@ -21,9 +24,12 @@ @dataclass class ForwardContext: - static_forward_context: Dict[str, Any] + # copy from vllm_config.compilation_config.static_forward_context + attn_layers: Dict[str, Any] # TODO: extend to support per-layer dynamic forward context - dynamic_forward_context: Any + attn_metadata: "AttentionMetadata" # set dynamically for each forward pass + # TODO: remove after making all virtual_engines share the same kv cache + virtual_engine: int # set dynamically for each forward pass _forward_context: Optional[ForwardContext] = None @@ -38,34 +44,35 @@ def get_forward_context() -> ForwardContext: 
@contextmanager -def set_forward_context(context: Any, vllm_config: VllmConfig): +def set_forward_context(attn_metadata: Any, + vllm_config: VllmConfig, + virtual_engine: int = 0): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass. """ global forward_start_time - need_to_track_batchsize = track_batchsize and context is not None + need_to_track_batchsize = track_batchsize and attn_metadata is not None if need_to_track_batchsize: forward_start_time = time.perf_counter() global _forward_context prev_context = _forward_context _forward_context = ForwardContext( - static_forward_context=vllm_config.compilation_config. - static_forward_context, - dynamic_forward_context=context) + attn_layers=vllm_config.compilation_config.static_forward_context, + virtual_engine=virtual_engine, + attn_metadata=attn_metadata) try: yield finally: - global batchsize_counter global last_logging_time, batchsize_logging_interval if need_to_track_batchsize: - if hasattr(context, "num_prefill_tokens"): + if hasattr(attn_metadata, "num_prefill_tokens"): # for v0 attention backends - batchsize = context.num_prefill_tokens + \ - context.num_decode_tokens + batchsize = attn_metadata.num_prefill_tokens + \ + attn_metadata.num_decode_tokens else: # for v1 attention backends - batchsize = context.num_input_tokens + batchsize = attn_metadata.num_input_tokens # we use synchronous scheduling right now, # adding a sync point here should not affect # scheduling of the next batch diff --git a/vllm/utils.py b/vllm/utils.py index 487088591ebc2..8c3e5200b3d98 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2138,3 +2138,38 @@ def get_mp_context(): _check_multiproc_method() mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD return multiprocessing.get_context(mp_method) + + +def bind_kv_cache( + ctx: Dict[str, Any], + kv_cache: List[List[torch.Tensor]], # [virtual_engine][layer_index] +) -> None: + # Bind the kv_cache tensor to Attention modules, similar to + # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)] + # Special things handled here: + # 1. Some models have non-attention layers, e.g., Jamba + # 2. Pipeline parallelism, each rank only has a subset of layers + # 3. Encoder attention has no kv cache + # 4. 
Encoder-decoder models, encoder-decoder attention and decoder-only + # attention of the same layer (e.g., bart's decoder.layers.1.self_attn + # and decoder.layers.1.encoder_attn) is mapped to the same kv cache + # tensor + from vllm.attention import AttentionType + from vllm.model_executor.models.utils import extract_layer_index + layer_need_kv_cache = [ + layer_name for layer_name in ctx + if ctx[layer_name].attn_type in (AttentionType.DECODER, + AttentionType.ENCODER_DECODER) + ] + layer_index_sorted = sorted( + set( + extract_layer_index(layer_name) + for layer_name in layer_need_kv_cache)) + for layer_name in layer_need_kv_cache: + kv_cache_idx = layer_index_sorted.index( + extract_layer_index(layer_name)) + forward_ctx = ctx[layer_name] + assert len(forward_ctx.kv_cache) == len(kv_cache) + for ve, ve_kv_cache in enumerate(kv_cache): + assert forward_ctx.kv_cache[ve].numel() == 0 + forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a1d4f9b135789..fb87dc5a8222a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -16,7 +16,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.sampling_params import SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - LayerBlockType, cdiv, is_pin_memory_available) + LayerBlockType, bind_kv_cache, cdiv, + is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) from vllm.v1.engine.mm_input_mapper import MMInputMapperClient @@ -860,3 +861,6 @@ def initialize_kv_cache(self, num_blocks: int) -> None: torch.zeros(kv_cache_shape, dtype=self.kv_cache_dtype, device=self.device)) + bind_kv_cache( + self.vllm_config.compilation_config.static_forward_context, + [self.kv_caches]) diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index cc24cfe04d2ba..fa6775cbd6c66 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -305,7 +305,8 @@ def execute_model( intermediate_tensors, } - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. 
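The `bind_kv_cache` helper and the per-layer `kv_cache` lists added in this commit are easier to follow in isolation. The snippet below is a minimal, hedged sketch of the same binding pattern under simplified assumptions: `FakeAttention` and the layer names are made-up stand-ins rather than vLLM classes, and the real helper additionally handles non-attention layers, pipeline parallelism, and encoder/encoder-decoder attention as noted in its comments.

# Hedged sketch of the "bind KV cache after allocation" pattern; FakeAttention
# is a hypothetical stand-in, not vllm.attention.layer.Attention.
from typing import Dict, List

import torch


class FakeAttention:

    def __init__(self, num_virtual_engines: int = 1):
        # Placeholder tensors until the cache engine allocates the real ones.
        self.kv_cache: List[torch.Tensor] = [
            torch.tensor([]) for _ in range(num_virtual_engines)
        ]

    def forward(self, virtual_engine: int) -> torch.Tensor:
        # The layer looks up its cache from the bound list at runtime instead
        # of taking it as an argument, so the tensor never appears as an input
        # to the compiled graph.
        return self.kv_cache[virtual_engine]


def bind(layers: Dict[str, FakeAttention],
         kv_cache: List[List[torch.Tensor]]) -> None:
    # kv_cache is indexed as [virtual_engine][layer_index]; for simplicity we
    # assume sorted layer names follow the layer order.
    for layer_index, name in enumerate(sorted(layers)):
        for ve, per_engine_cache in enumerate(kv_cache):
            layers[name].kv_cache[ve] = per_engine_cache[layer_index]


layers = {"layers.0.attn": FakeAttention(), "layers.1.attn": FakeAttention()}
caches = [[torch.zeros(16), torch.zeros(16)]]  # one virtual engine, two layers
bind(layers, caches)
assert layers["layers.1.attn"].forward(virtual_engine=0) is caches[0][1]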
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index f1531e0fc0675..d99db4e0c6c40 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -526,7 +526,8 @@ def execute_model( execute_model_kwargs.update( {"previous_hidden_states": previous_hidden_states}) - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): hidden_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 17b2fd2564a04..d31ba89e12375 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -69,7 +69,8 @@ def execute_model( intermediate_tensors, } - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): hidden_states = model_executable(**execute_model_kwargs) # Only perform pooling in the driver worker. diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index b5dfebfce6f75..494c6506f3c0f 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -13,7 +13,7 @@ from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner @@ -293,6 +293,8 @@ def _init_cache_engine(self) -> None: self.cache_engine[ve].cpu_cache for ve in range(self.parallel_config.pipeline_parallel_size) ] + bind_kv_cache(self.compilation_config.static_forward_context, + self.cpu_cache) self.model_runner.block_size = self.cache_engine[0].block_size assert all( diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 4d5d918087be8..8a161b740042d 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -175,7 +175,8 @@ def execute_model( } if self.has_inner_state else {} multi_modal_kwargs = model_input.multi_modal_kwargs or {} - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1c6d1bbee78ee..2b918483d3675 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1527,7 +1527,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self._update_inputs_to_capture_for_enc_dec_model( capture_inputs) - with set_forward_context(attn_metadata, self.vllm_config): + with set_forward_context(attn_metadata, self.vllm_config, + virtual_engine): graph_runner.capture(**capture_inputs) self.graph_memory_pool = graph_runner.graph.pool() self.graph_runners[virtual_engine][batch_size] = ( @@ -1695,7 +1696,7 @@ def execute_model( if not bypass_model_exec: with set_forward_context(model_input.attn_metadata, - self.vllm_config): + 
self.vllm_config, virtual_engine): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index f79b3773bcbd2..6de227f3cb2b9 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -105,7 +105,8 @@ def execute_model( if model_input.token_types is not None: cross_enc_kwargs["token_type_ids"] = model_input.token_types - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + virtual_engine): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index f51b51d433d3d..0f12549e3f3fd 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -21,7 +21,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import GiB_bytes, memory_profiling +from vllm.utils import GiB_bytes, bind_kv_cache, memory_profiling from vllm.worker.cache_engine import CacheEngine from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -285,6 +285,8 @@ def _init_cache_engine(self): self.cache_engine[ve].gpu_cache for ve in range(self.parallel_config.pipeline_parallel_size) ] + bind_kv_cache(self.compilation_config.static_forward_context, + self.gpu_cache) def _warm_up_model(self) -> None: if not self.model_config.enforce_eager: diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 249b3ed2dfd37..a835718e1db19 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -43,6 +43,7 @@ def __init__( self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config self.kv_transfer_config = vllm_config.kv_transfer_config + self.compilation_config = vllm_config.compilation_config from vllm.platforms import current_platform self.current_platform = current_platform From ac2f3f7fee93cf9cd97c0078e362feab7b6c8299 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Fri, 10 Jan 2025 00:56:36 -0700 Subject: [PATCH 43/55] [Bugfix] Validate lora adapters to avoid crashing server (#11727) Signed-off-by: Joe Runde Co-authored-by: Jee Jee Li --- .../entrypoints/openai/test_lora_adapters.py | 269 ++++++++++++++++++ tests/entrypoints/openai/test_lora_lineage.py | 109 ------- tests/entrypoints/openai/test_serving_chat.py | 8 +- .../entrypoints/openai/test_serving_models.py | 10 +- tests/entrypoints/openai/test_shutdown.py | 27 +- vllm/engine/async_llm_engine.py | 4 + vllm/engine/multiprocessing/__init__.py | 20 +- vllm/engine/multiprocessing/client.py | 42 ++- vllm/engine/multiprocessing/engine.py | 27 +- vllm/engine/protocol.py | 5 + vllm/entrypoints/openai/api_server.py | 7 +- vllm/entrypoints/openai/run_batch.py | 1 + vllm/entrypoints/openai/serving_models.py | 78 +++-- vllm/lora/worker_manager.py | 19 +- vllm/v1/engine/async_llm.py | 4 + 15 files changed, 459 insertions(+), 171 deletions(-) create mode 100644 tests/entrypoints/openai/test_lora_adapters.py delete mode 100644 tests/entrypoints/openai/test_lora_lineage.py diff --git a/tests/entrypoints/openai/test_lora_adapters.py 
b/tests/entrypoints/openai/test_lora_adapters.py new file mode 100644 index 0000000000000..46a064f6d9e68 --- /dev/null +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -0,0 +1,269 @@ +import asyncio +import json +import shutil +from contextlib import suppress + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio +# downloading lora to test lora requests +from huggingface_hub import snapshot_download + +from ...utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" + + +@pytest.fixture(scope="module") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.fixture(scope="module") +def server_with_lora_modules_json(zephyr_lora_files): + # Define the json format LoRA module configurations + lora_module_1 = { + "name": "zephyr-lora", + "path": zephyr_lora_files, + "base_model_name": MODEL_NAME + } + + lora_module_2 = { + "name": "zephyr-lora2", + "path": zephyr_lora_files, + "base_model_name": MODEL_NAME + } + + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + json.dumps(lora_module_1), + json.dumps(lora_module_2), + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "64", + ] + + # Enable the /v1/load_lora_adapter endpoint + envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server_with_lora_modules_json): + async with server_with_lora_modules_json.get_async_client( + ) as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_static_lora_lineage(client: openai.AsyncOpenAI, + zephyr_lora_files): + models = await client.models.list() + models = models.data + served_model = models[0] + lora_models = models[1:] + assert served_model.id == MODEL_NAME + assert served_model.root == MODEL_NAME + assert served_model.parent is None + assert all(lora_model.root == zephyr_lora_files + for lora_model in lora_models) + assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) + assert lora_models[0].id == "zephyr-lora" + assert lora_models[1].id == "zephyr-lora2" + + +@pytest.mark.asyncio +async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, + zephyr_lora_files): + + response = await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "zephyr-lora-3", + "lora_path": zephyr_lora_files + }) + # Ensure adapter loads before querying /models + assert "success" in response + + models = await client.models.list() + models = models.data + dynamic_lora_model = models[-1] + assert dynamic_lora_model.root == zephyr_lora_files + assert dynamic_lora_model.parent == MODEL_NAME + assert dynamic_lora_model.id == "zephyr-lora-3" + + +@pytest.mark.asyncio +async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI): + with pytest.raises(openai.NotFoundError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "notfound", + "lora_path": "/not/an/adapter" + }) + + +@pytest.mark.asyncio +async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, + tmp_path): + invalid_files = 
tmp_path / "invalid_files" + invalid_files.mkdir() + (invalid_files / "adapter_config.json").write_text("this is not json") + + with pytest.raises(openai.BadRequestError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "invalid-json", + "lora_path": str(invalid_files) + }) + + +@pytest.mark.asyncio +async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI, + tmp_path, zephyr_lora_files): + invalid_rank = tmp_path / "invalid_rank" + + # Copy adapter from zephyr_lora_files to invalid_rank + shutil.copytree(zephyr_lora_files, invalid_rank) + + with open(invalid_rank / "adapter_config.json") as f: + adapter_config = json.load(f) + + print(adapter_config) + + # assert False + + # Change rank to invalid value + adapter_config["r"] = 1024 + with open(invalid_rank / "adapter_config.json", "w") as f: + json.dump(adapter_config, f) + + with pytest.raises(openai.BadRequestError, + match="is greater than max_lora_rank"): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "invalid-json", + "lora_path": str(invalid_rank) + }) + + +@pytest.mark.asyncio +async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path, + zephyr_lora_files): + """Validate that many loras can be dynamically registered and inferenced + with concurrently""" + + # This test file configures the server with --max-cpu-loras=2 and this test + # will concurrently load 10 adapters, so it should flex the LRU cache + async def load_and_run_adapter(adapter_name: str): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": adapter_name, + "lora_path": str(zephyr_lora_files) + }) + for _ in range(3): + await client.completions.create( + model=adapter_name, + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) + + lora_tasks = [] + for i in range(10): + lora_tasks.append( + asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))) + + results, _ = await asyncio.wait(lora_tasks) + + for r in results: + assert not isinstance(r, Exception), f"Got exception {r}" + + +@pytest.mark.asyncio +async def test_loading_invalid_adapters_does_not_break_others( + client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files): + + invalid_files = tmp_path / "invalid_files" + invalid_files.mkdir() + (invalid_files / "adapter_config.json").write_text("this is not json") + + stop_good_requests_event = asyncio.Event() + + async def run_good_requests(client): + # Run chat completions requests until event set + + results = [] + + while not stop_good_requests_event.is_set(): + try: + batch = await client.completions.create( + model="zephyr-lora", + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) + results.append(batch) + except Exception as e: + results.append(e) + + return results + + # Create task to run good requests + good_task = asyncio.create_task(run_good_requests(client)) + + # Run a bunch of bad adapter loads + for _ in range(25): + with suppress(openai.NotFoundError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "notfound", + "lora_path": "/not/an/adapter" + }) + for _ in range(25): + with suppress(openai.BadRequestError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "invalid", + "lora_path": str(invalid_files) + }) + + # Ensure all the running requests with lora adapters succeeded + stop_good_requests_event.set() + results = await good_task + for r in results: + assert not isinstance(r, Exception), f"Got exception {r}" + + # Ensure we can load 
another adapter and run it + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "valid", + "lora_path": zephyr_lora_files + }) + await client.completions.create( + model="valid", + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py deleted file mode 100644 index ce4f85c13fff9..0000000000000 --- a/tests/entrypoints/openai/test_lora_lineage.py +++ /dev/null @@ -1,109 +0,0 @@ -import json - -import openai # use the official client for correctness check -import pytest -import pytest_asyncio -# downloading lora to test lora requests -from huggingface_hub import snapshot_download - -from ...utils import RemoteOpenAIServer - -# any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically this needs Mistral-7B-v0.1 as base, but we're not testing -# generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" - - -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - -@pytest.fixture(scope="module") -def server_with_lora_modules_json(zephyr_lora_files): - # Define the json format LoRA module configurations - lora_module_1 = { - "name": "zephyr-lora", - "path": zephyr_lora_files, - "base_model_name": MODEL_NAME - } - - lora_module_2 = { - "name": "zephyr-lora2", - "path": zephyr_lora_files, - "base_model_name": MODEL_NAME - } - - args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - json.dumps(lora_module_1), - json.dumps(lora_module_2), - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "64", - ] - - # Enable the /v1/load_lora_adapter endpoint - envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} - - with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: - yield remote_server - - -@pytest_asyncio.fixture -async def client_for_lora_lineage(server_with_lora_modules_json): - async with server_with_lora_modules_json.get_async_client( - ) as async_client: - yield async_client - - -@pytest.mark.asyncio -async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, - zephyr_lora_files): - models = await client_for_lora_lineage.models.list() - models = models.data - served_model = models[0] - lora_models = models[1:] - assert served_model.id == MODEL_NAME - assert served_model.root == MODEL_NAME - assert served_model.parent is None - assert all(lora_model.root == zephyr_lora_files - for lora_model in lora_models) - assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) - assert lora_models[0].id == "zephyr-lora" - assert lora_models[1].id == "zephyr-lora2" - - -@pytest.mark.asyncio -async def test_dynamic_lora_lineage( - client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files): - - response = await client_for_lora_lineage.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": - "zephyr-lora-3", - "lora_path": - zephyr_lora_files - }) - # Ensure adapter loads before querying /models - assert "success" in response - - models = await client_for_lora_lineage.models.list() - models = models.data - dynamic_lora_model = models[-1] - assert dynamic_lora_model.root == zephyr_lora_files - assert dynamic_lora_model.parent == MODEL_NAME - assert dynamic_lora_model.id == "zephyr-lora-3" diff --git 
a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index f431d1065e0eb..85f485364a411 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -52,7 +52,7 @@ async def _async_serving_chat_init(): engine = MockEngine() model_config = await engine.get_model_config() - models = OpenAIServingModels(model_config, BASE_MODEL_PATHS) + models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS) serving_completion = OpenAIServingChat(engine, model_config, models, @@ -73,7 +73,8 @@ def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False - models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, model_config=MockModelConfig()) serving_chat = OpenAIServingChat(mock_engine, MockModelConfig(), @@ -116,7 +117,8 @@ def test_serving_chat_could_load_correct_generation_config(): mock_engine.errored = False # Initialize the serving chat - models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, model_config=mock_model_config) serving_chat = OpenAIServingChat(mock_engine, mock_model_config, diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index 96897dc730da2..657ea20213ec9 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -4,6 +4,7 @@ import pytest from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import (ErrorResponse, LoadLoraAdapterRequest, UnloadLoraAdapterRequest) @@ -21,13 +22,16 @@ async def _async_serving_models_init() -> OpenAIServingModels: mock_model_config = MagicMock(spec=ModelConfig) + mock_engine_client = MagicMock(spec=EngineClient) # Set the max_model_len attribute to avoid missing attribute mock_model_config.max_model_len = 2048 - serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + serving_models = OpenAIServingModels(engine_client=mock_engine_client, + base_model_paths=BASE_MODEL_PATHS, model_config=mock_model_config, lora_modules=None, prompt_adapters=None) + await serving_models.init_static_loras() return serving_models @@ -113,5 +117,5 @@ async def test_unload_lora_adapter_not_found(): request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.type == "NotFoundError" + assert response.code == HTTPStatus.NOT_FOUND diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 6fcc92022855b..090523a836e12 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -1,6 +1,3 @@ -import json -import os - import openai import pytest @@ -10,16 +7,7 @@ @pytest.mark.asyncio -async def test_shutdown_on_engine_failure(tmp_path): - # Use a bad adapter to crash the engine - # (This test will fail when that bug is fixed) - adapter_path = tmp_path / "bad_adapter" - os.mkdir(adapter_path) - with open(adapter_path / "adapter_model_config.json", "w") as f: - json.dump({"not": 
"real"}, f) - with open(adapter_path / "adapter_model.safetensors", "wb") as f: - f.write(b"this is fake") - +async def test_shutdown_on_engine_failure(): # dtype, max-len etc set so that this can run in CI args = [ "--dtype", @@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path): "--enforce-eager", "--max-num-seqs", "128", - "--enable-lora", - "--lora-modules", - f"bad-adapter={tmp_path / 'bad_adapter'}", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -39,9 +24,13 @@ async def test_shutdown_on_engine_failure(tmp_path): with pytest.raises( (openai.APIConnectionError, openai.InternalServerError)): - # This crashes the engine - await client.completions.create(model="bad-adapter", - prompt="Hello, my name is") + # Asking for lots of prompt logprobs will currently crash the + # engine. This may change in the future when that bug is fixed + prompt = "Hello " * 4000 + await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + extra_body={"prompt_logprobs": 10}) # Now the server should shut down return_code = remote_server.proc.wait(timeout=8) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 66a5089074ff5..da23ed19ef7be 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1257,6 +1257,10 @@ async def stop_profile(self) -> None: else: self.engine.model_executor._run_workers("stop_profile") + async def add_lora(self, lora_request: LoRARequest) -> None: + """Load a new LoRA adapter into the engine for future requests.""" + self.engine.add_lora(lora_request) + # TODO(v1): Remove this class proxy when V1 goes default. if envs.VLLM_USE_V1: diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 420f540d0b5f4..7132f9840001a 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -1,4 +1,5 @@ -from dataclasses import dataclass +import uuid +from dataclasses import dataclass, field from enum import Enum from typing import List, Mapping, Optional, Union, overload @@ -120,10 +121,23 @@ class RPCUProfileRequest(Enum): STOP_PROFILE = 2 +@dataclass +class RPCLoadAdapterRequest: + lora_request: LoRARequest + # Set the default value of request_id to a new UUID + request_id: str = field(default_factory=lambda: str(uuid.uuid4())) + + +@dataclass +class RPCAdapterLoadedResponse: + request_id: str + + RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, - RPCUProfileRequest] + RPCUProfileRequest, RPCLoadAdapterRequest] -REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCError] +REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, + RPCError] def ENGINE_DEAD_ERROR( diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 0a046c71e86e8..a9ab899535180 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -25,8 +25,10 @@ IPC_HEALTH_EXT, IPC_INPUT_EXT, IPC_OUTPUT_EXT, RPC_REQUEST_T, VLLM_RPC_SUCCESS_STR, RPCAbortRequest, - RPCError, RPCProcessRequest, - RPCStartupRequest, RPCStartupResponse, + RPCAdapterLoadedResponse, RPCError, + RPCLoadAdapterRequest, + RPCProcessRequest, RPCStartupRequest, + RPCStartupResponse, RPCUProfileRequest) from vllm.engine.protocol import EngineClient # yapf: enable @@ -240,17 +242,22 @@ async def run_output_handler_loop(self): queue = self.output_queues.get(request_id) if queue is not None: queue.put_nowait(exception) + # Put each output into the appropriate 
queue. + elif isinstance(request_outputs, RPCAdapterLoadedResponse): + self._add_output(request_outputs) else: - # Put each output into the appropriate steam. for request_output in request_outputs: - queue = self.output_queues.get( - request_output.request_id) - if queue is not None: - queue.put_nowait(request_output) + self._add_output(request_output) except asyncio.CancelledError: logger.debug("Shutting down MQLLMEngineClient output handler.") + def _add_output(self, request_output: Union[RequestOutput, + RPCAdapterLoadedResponse]): + queue = self.output_queues.get(request_output.request_id) + if queue is not None: + queue.put_nowait(request_output) + async def setup(self): """Setup the client before it starts sending server requests.""" @@ -659,3 +666,24 @@ async def stop_profile(self) -> None: await self._send_one_way_rpc_request( request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket) + + async def add_lora(self, lora_request: LoRARequest) -> None: + """Load a new LoRA adapter into the engine for future requests.""" + # Uses the same I/O as generate requests + request = RPCLoadAdapterRequest(lora_request) + + # Create output queue for this requests. + queue: asyncio.Queue[Union[None, BaseException]] = asyncio.Queue() + self.output_queues[request.request_id] = queue + + # Send the request + request_bytes = pickle.dumps(request) + await self.input_socket.send_multipart((request_bytes, ), copy=False) + + # Wait for the response + request_output = await queue.get() + self.output_queues.pop(request.request_id) + + # Raise on error, otherwise happily return None + if isinstance(request_output, BaseException): + raise request_output diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 49a90b321dac4..36f4df4b02731 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -14,8 +14,10 @@ IPC_HEALTH_EXT, IPC_INPUT_EXT, IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T, VLLM_RPC_SUCCESS_STR, RPCAbortRequest, - RPCError, RPCProcessRequest, - RPCStartupRequest, RPCStartupResponse, + RPCAdapterLoadedResponse, RPCError, + RPCLoadAdapterRequest, + RPCProcessRequest, RPCStartupRequest, + RPCStartupResponse, RPCUProfileRequest) # yapf: enable from vllm.executor.gpu_executor import GPUExecutor @@ -234,6 +236,8 @@ def handle_new_input(self): self.start_profile() else: self.stop_profile() + elif isinstance(request, RPCLoadAdapterRequest): + self._handle_load_adapter_request(request) else: raise ValueError("Unknown RPCRequest Type: " f"{type(request)}") @@ -284,6 +288,19 @@ def _handle_abort_request(self, request: RPCAbortRequest): if self.log_requests: logger.info("Aborted request %s.", request.request_id) + def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest): + try: + self.engine.add_lora(request.lora_request) + except BaseException as e: + # Send back an error if the adater fails to load + rpc_err = RPCError(request_id=request.request_id, + is_engine_errored=False, + exception=e) + self._send_outputs(rpc_err) + # Otherwise, send back the successful load message + self._send_outputs( + RPCAdapterLoadedResponse(request_id=request.request_id)) + def _health_check(self): # Send unhealthy if engine has already errored if self._errored_with is not None: @@ -296,7 +313,11 @@ def _health_check(self): self._send_unhealthy(e) def _send_outputs(self, outputs: REQUEST_OUTPUTS_T): - """Send List of RequestOutput to RPCClient.""" + """Send outputs back to the engine client. 
These can be: + - Exceptions + - A list of generation outputs + - A response from loading a lora adapter + """ if outputs: try: from ray.exceptions import RayTaskError diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index a066836b92708..f05ff62c4766b 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -270,3 +270,8 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: """Start profiling the engine""" ... + + @abstractmethod + async def add_lora(self, lora_request: LoRARequest) -> None: + """Load a new LoRA adapter into the engine for future requests.""" + ... diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bc1471e1f534d..925d7db43138b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -662,7 +662,7 @@ async def add_request_id(request: Request, call_next): return app -def init_app_state( +async def init_app_state( engine_client: EngineClient, model_config: ModelConfig, state: State, @@ -690,12 +690,13 @@ def init_app_state( logger.info("Using supplied chat template:\n%s", resolved_chat_template) state.openai_serving_models = OpenAIServingModels( + engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, lora_modules=args.lora_modules, prompt_adapters=args.prompt_adapters, ) - # TODO: The chat template is now broken for lora adapters :( + await state.openai_serving_models.init_static_loras() state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, @@ -794,7 +795,7 @@ def signal_handler(*_) -> None: app = build_app(args) model_config = await engine_client.get_model_config() - init_app_state(engine_client, model_config, app.state, args) + await init_app_state(engine_client, model_config, app.state, args) shutdown_task = await serve_http( app, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 822c0f5f7c211..f8f136f9d5024 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -215,6 +215,7 @@ async def main(args): # Create the openai serving objects. 
openai_serving_models = OpenAIServingModels( + engine_client=engine, model_config=model_config, base_model_paths=base_model_paths, lora_modules=None, diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 26966896bc272..a222eafadcb68 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -5,15 +5,19 @@ from typing import List, Optional, Union from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import (ErrorResponse, LoadLoraAdapterRequest, ModelCard, ModelList, ModelPermission, UnloadLoraAdapterRequest) +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.utils import AtomicCounter +logger = init_logger(__name__) + @dataclass class BaseModelPath: @@ -45,6 +49,7 @@ class OpenAIServingModels: def __init__( self, + engine_client: EngineClient, model_config: ModelConfig, base_model_paths: List[BaseModelPath], *, @@ -55,20 +60,11 @@ def __init__( self.base_model_paths = base_model_paths self.max_model_len = model_config.max_model_len + self.engine_client = engine_client + self.static_lora_modules = lora_modules + self.lora_requests: List[LoRARequest] = [] self.lora_id_counter = AtomicCounter(0) - self.lora_requests = [] - if lora_modules is not None: - self.lora_requests = [ - LoRARequest(lora_name=lora.name, - lora_int_id=i, - lora_path=lora.path, - base_model_name=lora.base_model_name - if lora.base_model_name - and self.is_base_model(lora.base_model_name) else - self.base_model_paths[0].name) - for i, lora in enumerate(lora_modules, start=1) - ] self.prompt_adapter_requests = [] if prompt_adapters is not None: @@ -84,6 +80,19 @@ def __init__( prompt_adapter_local_path=prompt_adapter.local_path, prompt_adapter_num_virtual_tokens=num_virtual_tokens)) + async def init_static_loras(self): + """Loads all static LoRA modules. 
+ Raises if any fail to load""" + if self.static_lora_modules is None: + return + for lora in self.static_lora_modules: + load_request = LoadLoraAdapterRequest(lora_path=lora.path, + lora_name=lora.name) + load_result = await self.load_lora_adapter( + request=load_request, base_model_name=lora.base_model_name) + if isinstance(load_result, ErrorResponse): + raise ValueError(load_result.message) + def is_base_model(self, model_name): return any(model.name == model_name for model in self.base_model_paths) @@ -129,17 +138,47 @@ async def show_available_models(self) -> ModelList: async def load_lora_adapter( self, - request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: + request: LoadLoraAdapterRequest, + base_model_name: Optional[str] = None + ) -> Union[ErrorResponse, str]: error_check_ret = await self._check_load_lora_adapter_request(request) if error_check_ret is not None: return error_check_ret lora_name, lora_path = request.lora_name, request.lora_path unique_id = self.lora_id_counter.inc(1) - self.lora_requests.append( - LoRARequest(lora_name=lora_name, - lora_int_id=unique_id, - lora_path=lora_path)) + lora_request = LoRARequest(lora_name=lora_name, + lora_int_id=unique_id, + lora_path=lora_path) + if base_model_name is not None and self.is_base_model(base_model_name): + lora_request.base_model_name = base_model_name + + # Validate that the adapter can be loaded into the engine + # This will also pre-load it for incoming requests + try: + await self.engine_client.add_lora(lora_request) + except ValueError as e: + # Adapter not found or lora configuration errors + if "No adapter found" in str(e): + return create_error_response(message=str(e), + err_type="NotFoundError", + status_code=HTTPStatus.NOT_FOUND) + else: + return create_error_response( + message=str(e), + err_type="BadRequestError", + status_code=HTTPStatus.BAD_REQUEST) + except BaseException as e: + # Some other unexpected problem loading the adapter, e.g. malformed + # input files. + # More detailed error messages for the user would be nicer here + return create_error_response(message=str(e), + err_type="BadRequestError", + status_code=HTTPStatus.BAD_REQUEST) + + self.lora_requests.append(lora_request) + logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name, + lora_path) return f"Success: LoRA adapter '{lora_name}' added successfully." async def unload_lora_adapter( @@ -155,6 +194,7 @@ async def unload_lora_adapter( lora_request for lora_request in self.lora_requests if lora_request.lora_name != lora_name ] + logger.info("Removed LoRA adapter: name '%s'", lora_name) return f"Success: LoRA adapter '{lora_name}' removed successfully." 
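A minimal client-side sketch (not part of the patch) of how the dynamic-adapter flow changed here might be exercised. It assumes the server was started with runtime LoRA updating enabled and exposes `/v1/load_lora_adapter` and `/v1/unload_lora_adapter` routes taking `lora_name`/`lora_path` fields; the server address and adapter path are placeholders.

```python
import requests

BASE_URL = "http://localhost:8000"  # placeholder server address

# Load: with this change the server eagerly calls engine_client.add_lora(),
# so a missing or malformed adapter is reported here (404/400) instead of
# failing later on the first generation request that uses it.
resp = requests.post(
    f"{BASE_URL}/v1/load_lora_adapter",
    json={"lora_name": "sql_adapter", "lora_path": "/path/to/sql_adapter"},
)
print(resp.status_code, resp.text)

# The adapter can then be referenced by name like any other served model.
resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={"model": "sql_adapter", "prompt": "SELECT", "max_tokens": 8},
)
print(resp.status_code)

# Unload: unknown adapter names now come back as 404 (NotFoundError)
# rather than 400 after this series of changes.
resp = requests.post(
    f"{BASE_URL}/v1/unload_lora_adapter",
    json={"lora_name": "sql_adapter"},
)
print(resp.status_code, resp.text)
```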
async def _check_load_lora_adapter_request( @@ -195,8 +235,8 @@ async def _check_unload_lora_adapter_request( return create_error_response( message= f"The lora adapter '{request.lora_name}' cannot be found.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) + err_type="NotFoundError", + status_code=HTTPStatus.NOT_FOUND) return None diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 10976fac23028..eec462743fe9d 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -115,6 +115,14 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: embedding_padding_modules=self.embedding_padding_modules, weights_mapper=hf_to_vllm_mapper) + except FileNotFoundError as e: + # FileNotFoundError should be raised if both + # - No adapter found to download from huggingface (or in + # offline mode) + # - No local adapter files found at `lora_request.lora_path` + raise ValueError( + f"Loading lora {lora_request.lora_name} failed: No adapter " + f"found for {lora_path}") from e except Exception as e: raise RuntimeError(f"Loading lora {lora_path} failed") from e if lora.rank > self.lora_config.max_lora_rank: @@ -209,12 +217,19 @@ def _apply_adapters(self, lora_requests: Set[LoRARequest]) -> None: def add_adapter(self, lora_request: LoRARequest) -> bool: if lora_request.lora_int_id not in self.list_adapters(): - # Remove before we load the new lora to save memory + # Load the new adapter first to ensure it is actually valid, before + # evicting any existing adapters. + # This may cause the # of loaded lora adapters to very temporarily + # exceed `--max-cpu-loras`. + lora = self._load_adapter(lora_request) + + # Loading succeeded, now check if we will exceed cache capacity and + # evict if the oldest adapter if so if len(self._adapter_manager) + 1 > self._adapter_manager.capacity: assert isinstance(self._adapter_manager, LRUCacheLoRAModelManager) self._adapter_manager.remove_oldest_adapter() - lora = self._load_adapter(lora_request) + # Then add the new adapter to the cache loaded = self._adapter_manager.add_adapter(lora) else: # If the lora is already loaded, just touch it to diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b963ba74f13f0..5daae45dee85c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -339,3 +339,7 @@ def errored(self) -> bool: @property def dead_error(self) -> BaseException: return Exception() # TODO: implement + + async def add_lora(self, lora_request: LoRARequest) -> None: + """Load a new LoRA adapter into the engine for future requests.""" + raise NotImplementedError("LoRA not yet supported in V1") From 61af6332565d0093855fee7266699e548b1c0d1c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 10 Jan 2025 16:20:46 +0800 Subject: [PATCH 44/55] [BUGFIX] Fix `UnspecifiedPlatform` package name (#11916) Signed-off-by: Kunshang Ji --- vllm/platforms/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index f6ac14446c021..6ca95b41dbb07 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -179,7 +179,7 @@ def resolve_current_platform_cls_qualname() -> str: logger.info("Automatically detected platform %s.", activated_builtin_plugins[0]) else: - platform_cls_qualname = "vllm.interface.UnspecifiedPlatform" + platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform" logger.info( "No platform detected, vLLM is running on UnspecifiedPlatform") return 
platform_cls_qualname From d53575a5f0e5c0f9003b4ec6e33c8bf761e93cef Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 10 Jan 2025 16:25:17 +0800 Subject: [PATCH 45/55] [ci] fix gh200 tests (#11919) Signed-off-by: youkaichao --- vllm/model_executor/model_loader/weight_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 11d5fd7135d9e..9cfcdbf620d2b 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -29,7 +29,9 @@ try: from runai_model_streamer import SafetensorsStreamer -except ImportError: +except (ImportError, OSError): + # see https://github.com/run-ai/runai-model-streamer/issues/26 + # OSError will be raised on arm64 platform runai_model_streamer = PlaceholderModule( "runai_model_streamer") # type: ignore[assignment] SafetensorsStreamer = runai_model_streamer.placeholder_attr( From d907be7dc7926e64d6240bf4425d7399eaed150e Mon Sep 17 00:00:00 2001 From: cennn <61925104+cennn@users.noreply.github.com> Date: Fri, 10 Jan 2025 17:18:25 +0800 Subject: [PATCH 46/55] [misc] remove python function call for custom activation op (#11885) Co-authored-by: youkaichao --- vllm/_custom_ops.py | 27 -------- vllm/model_executor/layers/activation.py | 79 ++++++++++++++---------- 2 files changed, 46 insertions(+), 60 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index afb350591e562..d04cbbc0a9eed 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -34,33 +34,6 @@ def register_fake(fn): from torch.library import impl_abstract as register_fake -# activation ops -def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_and_mul(out, x) - - -def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_tanh_and_mul(out, x) - - -def fatrelu_and_mul(out: torch.Tensor, - x: torch.Tensor, - threshold: float = 0.0) -> None: - torch.ops._C.fatrelu_and_mul(out, x, threshold) - - -def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_fast(out, x) - - -def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_new(out, x) - - -def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_quick(out, x) - - # page attention ops def paged_attention_v1( out: torch.Tensor, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 32456fee06a28..2475190d197d3 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -30,6 +30,8 @@ class FatreluAndMul(CustomOp): def __init__(self, threshold: float = 0.): super().__init__() self.threshold = threshold + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + self.op = torch.ops._C.fatrelu_and_mul def forward_native(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 @@ -39,12 +41,10 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor: return x1 * x2 def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.fatrelu_and_mul(out, x, self.threshold) + self.op(out, x, self.threshold) return out @@ -103,6 +103,17 @@ def __init__(self, approximate: str = "none"): self.approximate = approximate if approximate not in ("none", "tanh"): raise ValueError(f"Unknown 
approximate mode: {approximate}") + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + if approximate == "none": + self.op = torch.ops._C.gelu_and_mul + elif approximate == "tanh": + self.op = torch.ops._C.gelu_tanh_and_mul + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + if approximate == "none": + self.op = ipex_ops.gelu_and_mul + else: + self.op = ipex_ops.gelu_tanh_and_mul def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" @@ -110,27 +121,17 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor: return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - if self.approximate == "none": - ops.gelu_and_mul(out, x) - elif self.approximate == "tanh": - ops.gelu_tanh_and_mul(out, x) + self.op(out, x) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: - from vllm._ipex_ops import ipex_ops as ops - d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - if self.approximate == "none": - ops.gelu_and_mul(out, x) - elif self.approximate == "tanh": - ops.gelu_tanh_and_mul(out, x) + self.op(out, x) return out def extra_repr(self) -> str: @@ -140,6 +141,14 @@ def extra_repr(self) -> str: @CustomOp.register("gelu_new") class NewGELU(CustomOp): + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + self.op = torch.ops._C.gelu_new + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + self.op = ipex_ops.gelu_new + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" c = math.sqrt(2.0 / math.pi) @@ -147,58 +156,62 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor: (x + 0.044715 * torch.pow(x, 3.0)))) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - out = torch.empty_like(x) - ops.gelu_new(out, x) + self.op(out, x) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: - from vllm._ipex_ops import ipex_ops as ops - - return ops.gelu_new(x) + return self.op(x) @CustomOp.register("gelu_fast") class FastGELU(CustomOp): + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + self.op = torch.ops._C.gelu_fast + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + self.op = ipex_ops.gelu_fast + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - out = torch.empty_like(x) - ops.gelu_fast(out, x) + self.op(out, x) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: - from vllm._ipex_ops import ipex_ops as ops - - return ops.gelu_fast(x) + return self.op(x) @CustomOp.register("quick_gelu") class QuickGELU(CustomOp): # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90 + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + self.op = 
torch.ops._C.gelu_quick + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + self.op = ipex_ops.gelu_quick + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" return x * torch.sigmoid(1.702 * x) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - out = torch.empty_like(x) - ops.gelu_quick(out, x) + self.op(out, x) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: - from vllm._ipex_ops import ipex_ops as ops - out = torch.empty_like(x) - ops.gelu_quick(out, x) + self.op(out, x) return out # TODO implement forward_xpu for QuickGELU From ef725feafcc1f2d6763cc888751fb2b36840587b Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 10 Jan 2025 18:02:38 +0800 Subject: [PATCH 47/55] [platform] support pytorch custom op pluggable (#11328) Signed-off-by: wangxiyuan --- vllm/model_executor/custom_op.py | 7 +++++++ vllm/platforms/interface.py | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 401606e8c76f0..96995c56bf504 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -57,6 +57,11 @@ def forward_hpu(self, *args, **kwargs): # PyTorch-native implementation. return self.forward_native(*args, **kwargs) + def forward_oot(self, *args, **kwargs): + # By default, we assume that OOT ops are compatible with the + # PyTorch-native implementation. + return self.forward_native(*args, **kwargs) + def dispatch_forward(self): # NOTE(woosuk): Here we assume that vLLM was built for only one # specific backend. Currently, we do not support dynamic dispatching. @@ -81,6 +86,8 @@ def dispatch_forward(self): return self.forward_tpu elif current_platform.is_xpu(): return self.forward_xpu + elif current_platform.is_out_of_tree(): + return self.forward_oot else: return self.forward_cuda diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index f440358f65fbb..01d753408e6d0 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -45,6 +45,7 @@ class PlatformEnum(enum.Enum): CPU = enum.auto() NEURON = enum.auto() OPENVINO = enum.auto() + OOT = enum.auto() UNSPECIFIED = enum.auto() @@ -107,6 +108,9 @@ def is_neuron(self) -> bool: def is_openvino(self) -> bool: return self._enum == PlatformEnum.OPENVINO + def is_out_of_tree(self) -> bool: + return self._enum == PlatformEnum.OOT + def is_cuda_alike(self) -> bool: """Stateless version of :func:`torch.cuda.is_available`.""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) From d85c47d6ad24c286ae55fd9da231808b8ddd7a7f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 10 Jan 2025 12:05:56 +0000 Subject: [PATCH 48/55] Replace "online inference" with "online serving" (#11923) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/run-cpu-test.sh | 2 +- docs/source/features/structured_outputs.md | 4 ++-- docs/source/getting_started/installation/hpu-gaudi.md | 4 ++-- docs/source/getting_started/quickstart.md | 2 +- docs/source/models/generative_models.md | 2 +- docs/source/models/pooling_models.md | 2 +- docs/source/models/supported_models.md | 4 ++-- docs/source/serving/multimodal_inputs.md | 2 +- .../openai_chat_completion_client_for_multimodal.py | 4 ++-- tests/models/decoder_only/audio_language/test_ultravox.py | 4 ++-- vllm/model_executor/models/molmo.py | 2 +- 11 files changed, 16 
insertions(+), 16 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 1a4dae8f65e99..5a285be039393 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -61,7 +61,7 @@ function cpu_tests() { pytest -s -v -k cpu_model \ tests/basic_correctness/test_chunked_prefill.py" - # online inference + # online serving docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index ccd9a6a1b1a14..a42c3dd64ad10 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -5,7 +5,7 @@ vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding. This document shows you some examples of the different options that are available to generate structured outputs. -## Online Inference (OpenAI API) +## Online Serving (OpenAI API) You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. @@ -239,7 +239,7 @@ The main available options inside `GuidedDecodingParams` are: - `backend` - `whitespace_pattern` -These parameters can be used in the same way as the parameters from the Online Inference examples above. +These parameters can be used in the same way as the parameters from the Online Serving examples above. One example for the usage of the `choices` parameter is shown below: ```python diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md index 1d50cef3bdc83..21822327c8822 100644 --- a/docs/source/getting_started/installation/hpu-gaudi.md +++ b/docs/source/getting_started/installation/hpu-gaudi.md @@ -83,7 +83,7 @@ $ python setup.py develop ## Supported Features - [Offline inference](#offline-inference) -- Online inference via [OpenAI-Compatible Server](#openai-compatible-server) +- Online serving via [OpenAI-Compatible Server](#openai-compatible-server) - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, @@ -385,5 +385,5 @@ the below: completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding `--enforce-eager` flag to - server (for online inference), or by passing `enforce_eager=True` + server (for online serving), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). 
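For reference, a minimal sketch of the two options described in the HPU Graphs note above; the model name is a placeholder and not taken from this patch.

```python
# Offline inference: disable graph capture via the LLM constructor.
from vllm import LLM

llm = LLM(model="your-org/your-model", enforce_eager=True)  # placeholder model
print(llm.generate("Hello, my name is")[0].outputs[0].text)

# Online serving: the equivalent switch is the CLI flag, e.g.
#   vllm serve your-org/your-model --enforce-eager
```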
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index ea15d9ef065fa..d7d43785c6c24 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -5,7 +5,7 @@ This guide will help you quickly get started with vLLM to perform: - [Offline batched inference](#quickstart-offline) -- [Online inference using OpenAI-compatible server](#quickstart-online) +- [Online serving using OpenAI-compatible server](#quickstart-online) ## Prerequisites diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index a9f74c4d3fbb8..6a5a58ad74ab7 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -118,7 +118,7 @@ print("Loaded chat template:", custom_template) outputs = llm.chat(conversation, chat_template=custom_template) ``` -## Online Inference +## Online Serving Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 745f3fd81980d..324b1f550e694 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -127,7 +127,7 @@ print(f"Score: {score}") A code example can be found here: -## Online Inference +## Online Serving Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index acbe27a22a679..72910ea1e2d19 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -552,7 +552,7 @@ See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the mod ````{important} To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) -or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: +or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: Offline inference: ```python @@ -562,7 +562,7 @@ llm = LLM( ) ``` -Online inference: +Online serving: ```bash vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 ``` diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 9f5e1b908d786..7e96ed46f2dcc 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -199,7 +199,7 @@ for o in outputs: print(generated_text) ``` -## Online Inference +## Online Serving Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 213d075542e81..03cc037bb6779 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -1,5 +1,5 @@ """An example showing how to use vLLM to serve multimodal models -and run online inference with OpenAI client. +and run online serving with OpenAI client. 
Launch the vLLM server with the following command: @@ -309,7 +309,7 @@ def main(args) -> None: if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Demo on using OpenAI client for online inference with ' + description='Demo on using OpenAI client for online serving with ' 'multimodal language models served with vLLM.') parser.add_argument('--chat-type', '-c', diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 0bb98df1b58e6..1e329dc4cb22e 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -237,8 +237,8 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, @pytest.mark.asyncio -async def test_online_inference(client, audio_assets): - """Exercises online inference with/without chunked prefill enabled.""" +async def test_online_serving(client, audio_assets): + """Exercises online serving with/without chunked prefill enabled.""" messages = [{ "role": diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 2e60bc719f096..c45ee9b921c9e 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1068,7 +1068,7 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): trust_remote_code=model_config.trust_remote_code) # NOTE: message formatting for raw text prompt is only applied for - # offline inference; for online inference, the prompt is always in + # offline inference; for online serving, the prompt is always in # instruction format and tokenized. if prompt is not None and re.match(r"^User:[\s\S]*?(Assistant:)*$", prompt): From 241ad7b301facac0728e2b3312d71fe47acc8c9e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 10 Jan 2025 20:45:33 +0800 Subject: [PATCH 49/55] [ci] Fix sampler tests (#11922) Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 1 + tests/conftest.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e288f8f30159a..7d13269540864 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -214,6 +214,7 @@ steps: - vllm/model_executor/layers - vllm/sampling_metadata.py - tests/samplers + - tests/conftest.py commands: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers diff --git a/tests/conftest.py b/tests/conftest.py index 917151ddcb8d4..95af4ac1eb17b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,12 +28,13 @@ init_distributed_environment, initialize_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - to_enc_dec_tuple_list, zip_enc_dec_prompts) + TokensPrompt, to_enc_dec_tuple_list, + zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, - identity) + identity, is_list_of) logger = init_logger(__name__) @@ -886,6 +887,12 @@ def generate_beam_search( beam_width: int, max_tokens: int, ) -> List[Tuple[List[List[int]], List[str]]]: + if is_list_of(prompts, str, check="all"): + prompts = [TextPrompt(prompt=prompt) for prompt in prompts] + else: + prompts = [ + TokensPrompt(prompt_token_ids=tokens) for tokens in prompts + ] outputs = self.model.beam_search( prompts, 
BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) From 12664ddda522b3a22c5b71eca9b2c907e3a687b3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 10 Jan 2025 22:30:25 +0800 Subject: [PATCH 50/55] [Doc] [1/N] Initial guide for merged multi-modal processor (#11925) Signed-off-by: DarkLight1337 --- docs/requirements-docs.txt | 1 + docs/source/api/multimodal/index.md | 2 +- docs/source/api/multimodal/inputs.md | 2 +- docs/source/conf.py | 1 + docs/source/contributing/model/index.md | 2 +- docs/source/contributing/model/multimodal.md | 380 +++++++++++++++--- .../source/contributing/model/registration.md | 2 +- .../input_processing_pipeline.md | 19 - .../input_processing/model_inputs_index.md | 43 -- docs/source/design/mm_processing.md | 64 +++ docs/source/index.md | 2 +- docs/source/serving/multimodal_inputs.md | 2 +- vllm/config.py | 3 +- vllm/inputs/__init__.py | 3 - vllm/inputs/registry.py | 12 +- vllm/multimodal/__init__.py | 4 +- vllm/multimodal/base.py | 14 - vllm/multimodal/inputs.py | 3 +- vllm/multimodal/registry.py | 12 +- 19 files changed, 403 insertions(+), 168 deletions(-) delete mode 100644 docs/source/design/input_processing/input_processing_pipeline.md delete mode 100644 docs/source/design/input_processing/model_inputs_index.md create mode 100644 docs/source/design/mm_processing.md diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 64cf6ef8fc19d..8217bc3ba3ded 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -3,6 +3,7 @@ sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 myst-parser==3.0.1 sphinx-argparse==0.4.0 +sphinx-design==0.6.1 sphinx-togglebutton==0.3.2 msgspec cloudpickle diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index 51e24795a34cf..14efdb506d76f 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -7,7 +7,7 @@ vLLM provides experimental support for multi-modal models through the {mod}`vllm Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. -Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). +Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). ## Module Contents diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md index 3d89666113229..76b2fb95a5009 100644 --- a/docs/source/api/multimodal/inputs.md +++ b/docs/source/api/multimodal/inputs.md @@ -3,7 +3,7 @@ ## User-facing inputs ```{eval-rst} -.. autodata:: vllm.multimodal.MultiModalDataDict +.. autodata:: vllm.multimodal.inputs.MultiModalDataDict ``` ## Internal data structures diff --git a/docs/source/conf.py b/docs/source/conf.py index 1ce11fe057071..bff0141ffbce8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,6 +43,7 @@ "sphinx.ext.autosummary", "myst_parser", "sphinxarg.ext", + "sphinx_design", "sphinx_togglebutton", ] myst_enable_extensions = [ diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md index a2d601c83cf47..245e13b795ec4 100644 --- a/docs/source/contributing/model/index.md +++ b/docs/source/contributing/model/index.md @@ -2,7 +2,7 @@ # Adding a New Model -This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. 
+This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. ```{toctree} :caption: Contents diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index e5dcd1223b361..76ab73e43d24b 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -1,6 +1,6 @@ -(enabling-multimodal-inputs)= +(supports-multimodal)= -# Enabling Multimodal Inputs +# Multi-Modal Support This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs). @@ -37,103 +37,355 @@ Further update the model as follows: ) -> SamplerOutput: ``` -## 2. Register input mappers +## 2. Specify processing information -For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper `. -This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`. +Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo` +to provide basic information related to HF processing. -```diff - from vllm.model_executor.models.interfaces import SupportsMultiModal -+ from vllm.multimodal import MULTIMODAL_REGISTRY +### Maximum number of input items -+ @MULTIMODAL_REGISTRY.register_image_input_mapper() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits` +to return the maximum number of input items for each modality supported by the model. + +For example, if the model supports any number of images but only one video per prompt: + +```python +def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": 1} ``` -A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. +### Maximum number of placeholder feature tokens + +Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item` +to return the maximum number of placeholder feature tokens per input item for each modality. + +When calling the model, the output embeddings from the visual encoder are assigned to the input positions +containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal +to the size of the output embeddings. 
+ +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at the code of HF's `LlavaForConditionalGeneration`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 +n_image_tokens = (input_ids == self.config.image_token_index).sum().item() +n_image_features = image_features.shape[0] * image_features.shape[1] + +if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) +special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) +) +image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) +inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) +``` -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) +The number of placeholder feature tokens per image is `image_features.shape[1]`. +`image_features` is calculated inside the `get_image_features` method: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 +image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + +selected_image_feature = image_outputs.hidden_states[vision_feature_layer] +if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] +elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature +else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") +image_features = self.multi_modal_projector(selected_image_feature) +return image_features ``` -## 3. Register maximum number of multi-modal tokens +We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower +(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). +Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`. +The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention +mechanism doesn't change the sequence length of the output hidden states. + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102 +hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) +hidden_states = self.pre_layrnorm(hidden_states) + +encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, +) +``` -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item -and register it via {meth}`INPUT_REGISTRY.register_dummy_data `. 
+To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 +target_dtype = self.patch_embedding.weight.dtype +patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] +patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + +class_embeds = self.class_embedding.expand(batch_size, 1, -1) +embeddings = torch.cat([class_embeds, patch_embeds], dim=1) +if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) +else: + embeddings = embeddings + self.position_embedding(self.position_ids) +return embeddings +``` -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY +We can infer that `embeddings.shape[1] == self.num_positions`, where - @MULTIMODAL_REGISTRY.register_image_input_mapper() -+ @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196 +self.num_patches = (self.image_size // self.patch_size) ** 2 +self.num_positions = self.num_patches + 1 ``` -Here are some examples: +Overall, the number of placeholder feature tokens for an image can be calculated as: -- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) +```python +def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, +) -> int: + hf_config = self.get_hf_config() + hf_processor = self.get_hf_processor() -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) + image_size = hf_config.vision_config.image_size + patch_size = hf_config.vision_config.patch_size + + num_image_tokens = (image_size // patch_size) ** 2 + 1 + if hf_processor.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + return num_image_tokens ``` -## 4. (Optional) Register dummy data +Notice that the number of image tokens doesn't depend on the image width and height. +So, we can calculate the maximum number of image tokens using any image size: -During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. -In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data `. 
+```python +def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + width = height = hf_config.image_size + return ImageSize(width=width, height=height) -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY +def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() -+ @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) +``` + +And thus, we can override the method as: + +```python +def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} ``` ```{note} -The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. +Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP. ``` +::: +:::: + +## 3. Specify dummy inputs + +Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for +HF processing as well as memory profiling. + +### For memory profiling + +Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs` +to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of +the model so that vLLM can reserve the correct amount of memory for it. + +Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based +on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava +Making use of the `get_image_size_with_most_features` method implemented in the previous section: + +```python +def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], +) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + hf_config = self.get_hf_config() + target_width, target_height = self.info.get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) +``` +::: +:::: -Here are some examples: +## 4. Specify processing details -- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) +Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` +to fill in the missing details about HF processing. ```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) +[Multi-Modal Data Processing](#mm-processing) ``` -## 5. (Optional) Register input processor +### Multi-modal fields + +Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to +return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. 
+ +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at the model's `forward` method: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L387-L404 +def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, +) -> Union[Tuple, LlavaCausalLMOutputWithPast]: +``` -Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor. -This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's {meth}`~torch.nn.Module.forward` call. -You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor `. +The only related keyword argument is `pixel_values` which directly corresponds to input images. +The shape of `pixel_values` is `(N, C, H, W)` where `N` is the number of images. +So, we override the method as follows: + +```python +def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], +) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + ) +``` -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY +```{note} +Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports +pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument. +``` +::: +:::: - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() -+ @INPUT_REGISTRY.register_input_processor() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +### Prompt replacements + +Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` to +return a list of {class}`~vllm.multimodal.processing.PromptReplacement` instances. + +Each {class}`~vllm.multimodal.processing.PromptReplacement` instance specifies a find-and-replace +operation performed by the HF processor. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at HF's `LlavaProcessor`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 +prompt_strings = [] +for sample in text: + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) + prompt_strings.append(sample) ``` -A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. 
-Here are some examples: +It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). +Based on this, we override the method as follows: + +```python +def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, +) -> list[PromptReplacement]: + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_index + + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] +``` +::: +:::: -- Insert static number of image tokens: [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Insert dynamic number of image tokens: [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) +## 5. Register processor-related classes -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) +After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2), +{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3), +and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4), +decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor ` +to register them to the multi-modal registry: + +```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal ++ from vllm.multimodal import MULTIMODAL_REGISTRY + ++ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, ++ info=YourProcessingInfo, ++ dummy_inputs=YourDummyInputsBuilder) + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md index fe5aa94c52896..6a9262669cd29 100644 --- a/docs/source/contributing/model/registration.md +++ b/docs/source/contributing/model/registration.md @@ -48,7 +48,7 @@ ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCaus ```{important} If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. -Read more about that [here](#enabling-multimodal-inputs). +Read more about that [here](#supports-multimodal). ``` ```{note} diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md deleted file mode 100644 index bb16920e3d0c0..0000000000000 --- a/docs/source/design/input_processing/input_processing_pipeline.md +++ /dev/null @@ -1,19 +0,0 @@ -(input-processing-pipeline)= - -# Input Processing Pipeline - -1. Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`). - -2. Tokenize the data if necessary. - -3. Process the inputs using {meth}`INPUT_REGISTRY.process_input `. - - - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. - -4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`. - -5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`. - -6. 
If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input `. - - - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model. diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md deleted file mode 100644 index cb415366e5a66..0000000000000 --- a/docs/source/design/input_processing/model_inputs_index.md +++ /dev/null @@ -1,43 +0,0 @@ -(input-processing)= - -# Input Processing - -```{eval-rst} -.. currentmodule:: vllm.inputs -``` - -Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via -{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. - -Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input -data in addition to input prompt, but it can be extended to text-only language models when needed. - -## Guides - -```{toctree} -:maxdepth: 1 - -input_processing_pipeline -``` - -## Module Contents - -### LLM Engine Inputs - -```{eval-rst} -.. autoclass:: vllm.inputs.DecoderOnlyInputs - :members: - :show-inheritance: -``` - -### Registry - -```{eval-rst} -.. autodata:: vllm.inputs.INPUT_REGISTRY -``` - -```{eval-rst} -.. automodule:: vllm.inputs.registry - :members: - :show-inheritance: -``` diff --git a/docs/source/design/mm_processing.md b/docs/source/design/mm_processing.md new file mode 100644 index 0000000000000..a0d01205e638c --- /dev/null +++ b/docs/source/design/mm_processing.md @@ -0,0 +1,64 @@ +(mm-processing)= + +# Multi-Modal Data Processing + +To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. ``) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor. + +Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`: + +## Prompt Replacement Detection + +One of the main responsibilies of HF processor is to replace input placeholder tokens (e.g. `` for a single image) with feature placeholder tokens (e.g. `...`, the number of which equals to the feature size). The information about which tokens have been replaced is key to finding the correspondence between placeholder feature tokens and multi-modal inputs. + +In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptReplacement` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`. Given this specification, we can automatically detect whether HF has replaced the input placeholder tokens by checking whether the feature placeholder tokens exist in the prompt. + +## Tokenized Prompt Inputs + +To enable tokenization in a separate process, we support passing input token IDs alongside multi-modal data. + +### The problem + +Consider that HF processors follow these main steps: + +1. Tokenize the text +2. Process multi-modal inputs +3. Perform prompt replacement + +And we require that: + +- For text + multi-modal inputs, apply all steps 1--3. +- For tokenized + multi-modal inputs, apply only steps 2--3. + +How can we achieve this without rewriting HF processors? 
We can try to call the HF processor several times on different inputs:
+
+- For text + multi-modal inputs, simply call the HF processor directly.
+- For tokenized + multi-modal inputs, call the processor only on the multi-modal inputs.
+
+While HF processors support text + multi-modal inputs natively, this is not so for tokenized + multi-modal inputs: an error is thrown if the number of input placeholder tokens does not correspond to the number of multi-modal inputs.
+
+Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
+
+(mm-dummy-text)=
+
+### Dummy text
+
+We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
+
+(mm-automatic-prompt-replacement)=
+
+### Automatic prompt replacement
+
+We address the second issue by implementing model-agnostic code in
+{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_replacements` to automatically replace input placeholder tokens with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`.
+
+### Summary
+
+With the help of dummy text and automatic prompt replacement, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`.
+
+## Processor Output Caching
+
+Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
+
+When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
+
+Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt replacement code, we apply [automatic prompt replacement](#mm-automatic-prompt-replacement) afterwards to keep the output tokens and multi-modal data consistent with each other.
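To make the caching flow described in the last section concrete, here is a schematic, self-contained sketch. It is not the actual vLLM implementation; the `CachingProcessor` class, its toy cache key, and the stand-in HF processor are invented purely for illustration.

```python
from typing import Any, Dict, List, Tuple

class CachingProcessor:
    def __init__(self, hf_processor):
        # `hf_processor` stands in for a (slow) HF processor:
        # callable(text, images) -> one output per image.
        self.hf_processor = hf_processor
        self.cache: Dict[bytes, Any] = {}

    def _key(self, image: bytes) -> bytes:
        # Toy cache key: the raw image bytes themselves.
        return image

    def __call__(self, images: List[bytes]) -> List[Any]:
        keys = [self._key(img) for img in images]
        missing: List[Tuple[bytes, bytes]] = [
            (k, img) for k, img in zip(keys, images) if k not in self.cache
        ]
        if missing:
            # Only the missing items go through the processor, in one batch,
            # paired with dummy text so the placeholder-token count matches
            # the number of items being processed.
            dummy_text = "<image>" * len(missing)
            outputs = self.hf_processor(dummy_text, [img for _, img in missing])
            for (k, _), out in zip(missing, outputs):
                self.cache[k] = out
        # Merge cached and freshly computed outputs, preserving input order.
        return [self.cache[k] for k in keys]

# Toy usage: the stand-in "processor" just reports each image's size in bytes.
processor = CachingProcessor(lambda text, images: [len(img) for img in images])
print(processor([b"imgA", b"imgB"]))  # both items processed and cached
print(processor([b"imgB", b"imgC"]))  # only imgC hits the stand-in processor
```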
diff --git a/docs/source/index.md b/docs/source/index.md index 356fa4b7fd573..de74276a50fb6 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -145,7 +145,7 @@ design/arch_overview design/huggingface_integration design/plugin_system design/kernel/paged_attention -design/input_processing/model_inputs_index +design/mm_processing design/automatic_prefix_caching design/multiprocessing ``` diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 7e96ed46f2dcc..a06f121a6899a 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -14,7 +14,7 @@ and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/ch To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`: - `prompt`: The prompt should follow the format that is documented on HuggingFace. -- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`. +- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`. ### Image diff --git a/vllm/config.py b/vllm/config.py index 13b5390008a35..59b509d5a961e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2124,8 +2124,7 @@ class MultiModalConfig: limit_per_prompt: Mapping[str, int] = field(default_factory=dict) """ - The maximum number of multi-modal input instances allowed per prompt - for each :class:`~vllm.multimodal.MultiModalPlugin`. + The maximum number of input items allowed per prompt for each modality. """ def compute_hash(self) -> str: diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index aaeecab7ffde1..a0dd89f69bacd 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -11,9 +11,6 @@ """ The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine` to dispatch data processing according to the target model. - -See also: - :ref:`input-processing-pipeline` """ __all__ = [ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index aad0dfab94a01..4b73ade7af5f0 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -313,9 +313,6 @@ def dummy_data_for_profiling( The model is identified by ``model_config``. - See also: - :ref:`enabling-multimodal-inputs` - Note: This should be called after :meth:`~MultiModalRegistry.init_mm_limits_per_prompt`. @@ -384,10 +381,8 @@ def register_input_processor(self, processor: InputProcessor): Register an input processor to a model class. The provided function is invoked on each input to the model. This - happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`. - - See also: - :ref:`input-processing-pipeline` + happens before + :meth:`~vllm.multimodal.registry.MultiModalRegistry.map_input`. """ def wrapper(model_cls: N) -> N: @@ -429,9 +424,6 @@ def process_input(self, model_config: "ModelConfig", Apply an input processor to an instance of model inputs. The model is identified by ``model_config``. 
- - See also: - :ref:`input-processing-pipeline` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 343b9322ecc5e..1d7f5d57fa24e 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -8,10 +8,10 @@ MULTIMODAL_REGISTRY = MultiModalRegistry() """ The global :class:`~MultiModalRegistry` is used by model runners to -dispatch data processing according to its modality and the target model. +dispatch data processing according to the target model. See also: - :ref:`input-processing-pipeline` + :ref:`mm-processing` """ __all__ = [ diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 4941fbac963ca..fd3ec7e0ec8ce 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -90,10 +90,6 @@ def register_input_mapper( invoked to transform the data into a dictionary of model inputs. If `None` is provided, then the default input mapper is used instead. - - See also: - - :ref:`input-processing-pipeline` - - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -126,10 +122,6 @@ def map_input( Raises: TypeError: If the data type is not supported. - - See also: - - :ref:`input-processing-pipeline` - - :ref:`enabling-multimodal-inputs` """ # Avoid circular import @@ -186,9 +178,6 @@ def register_max_multimodal_tokens( for a model class. If `None` is provided, then the default calculation is used instead. - - See also: - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -218,9 +207,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: If this registry is not applicable to the model, `0` is returned. The model is identified by ``model_config``. - - See also: - :ref:`enabling-multimodal-inputs` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 8680e4175593b..4b63703585214 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -493,7 +493,8 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: class MultiModalInputsV2(TypedDict): """ - Represents the outputs of :class:`vllm.multimodal.MultiModalProcessor`, + Represents the outputs of + :class:`vllm.multimodal.processing.BaseMultiModalProcessor`, ready to be passed to vLLM internals. """ diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 9eceefb08c93f..804a91da8c889 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -100,8 +100,7 @@ def __getitem__(self, key: "ModelConfig") -> Dict[str, int]: class MultiModalRegistry: """ - A registry that dispatches data processing to the - :class:`~vllm.multimodal.MultiModalPlugin` for each modality. + A registry that dispatches data processing according to the model. """ DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin()) @@ -367,8 +366,7 @@ def register_processor( invoked to transform the data into a dictionary of model inputs. See also: - - :ref:`input-processing-pipeline` - - :ref:`enabling-multimodal-inputs` + :ref:`mm-processing` """ def wrapper(model_cls: N) -> N: @@ -398,6 +396,9 @@ def _get_model_cls(self, model_config: "ModelConfig"): def has_processor(self, model_config: "ModelConfig") -> bool: """ Test whether a multi-modal processor is defined for a specific model. 
+ + See also: + :ref:`mm-processing` """ return self._get_model_cls(model_config) in self._processor_factories @@ -408,6 +409,9 @@ def create_processor( ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. + + See also: + :ref:`mm-processing` """ model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] From 20410b2fdac1818ead453018fb07c2ff90ee6770 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 10 Jan 2025 23:46:51 +0800 Subject: [PATCH 51/55] [platform] support custom torch.compile backend key (#11318) Signed-off-by: wangxiyuan Signed-off-by: youkaichao Co-authored-by: youkaichao --- vllm/model_executor/layers/rejection_sampler.py | 3 ++- vllm/model_executor/layers/vocab_parallel_embedding.py | 2 +- vllm/model_executor/models/commandr.py | 3 ++- vllm/model_executor/models/phi3_small.py | 5 +++-- vllm/platforms/interface.py | 6 ++++++ 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index f173cbde03f44..9d6c3797c62fc 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -9,6 +9,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.spec_decode_base_sampler import ( SpecDecodeStochasticBaseSampler) +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -368,7 +369,7 @@ def _smallest_positive_value(self) -> float: # Note that we always sample with replacement. # probs will be modified in place, but this is fine, as we pass # in a copy already. -@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def _multinomial( probs: torch.Tensor, num_samples: int, diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 30548e656c557..65920aa61ba15 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -133,7 +133,7 @@ def __post_init__(self): assert self.num_added_elements <= self.num_added_elements_padded -@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def get_masked_input_and_mask( input_: torch.Tensor, org_vocab_start_index: int, org_vocab_end_index: int, num_org_vocab_padding: int, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index d22d1f3171463..8d61ece289412 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -45,6 +45,7 @@ row_parallel_weight_loader) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -53,7 +54,7 @@ maybe_prefix) -@torch.compile +@torch.compile(backend=current_platform.simple_compile_backend) def layer_norm_func(hidden_states, weight, variance_epsilon): input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index da7e4cdbc6940..f47676b934e4e 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -20,6 +20,7 @@ 
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -54,12 +55,12 @@ def weight_loader(self, param: torch.nn.Parameter, return load_column_parallel_weight(param, loaded_weight) -@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def quick_gelu(x): return x * torch.sigmoid(1.702 * x) -@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def gegelu(input, limit: Optional[float] = None): a_gelu, a_linear = input[..., ::2], input[..., 1::2] if limit is not None: diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 01d753408e6d0..fe398801c5dd9 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -82,6 +82,12 @@ class Platform: # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa # use "CPU" as a fallback for platforms not registered in PyTorch dispatch_key: str = "CPU" + # The torch.compile backend for compiling simple and + # standalone functions. The default value is "inductor" to keep + # the same behavior as PyTorch. + # NOTE: for the forward part of the model, vLLM has another separate + # compilation strategy. + simple_compile_backend: str = "inductor" supported_quantization: list[str] = [] def is_cuda(self) -> bool: From 482cdc494e608b72303f49b56532f5c50b61cbdb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:50:29 +0000 Subject: [PATCH 52/55] [Doc] Rename offline inference examples (#11927) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/run-cpu-test.sh | 2 +- .buildkite/run-gh200-test.sh | 2 +- .buildkite/run-hpu-test.sh | 2 +- .buildkite/run-neuron-test.sh | 2 +- .buildkite/run-openvino-test.sh | 2 +- .buildkite/run-tpu-test.sh | 2 +- .buildkite/run-xpu-test.sh | 4 ++-- .buildkite/test-pipeline.yaml | 20 +++++++++---------- .../contributing/profiling/profiling_index.md | 2 +- docs/source/features/structured_outputs.md | 2 +- .../getting_started/installation/cpu-x86.md | 4 ++-- docs/source/getting_started/quickstart.md | 2 +- docs/source/models/generative_models.md | 4 ++-- docs/source/models/pooling_models.md | 6 +++--- docs/source/serving/multimodal_inputs.md | 8 ++++---- ...{offline_inference_arctic.py => arctic.py} | 0 ...ce_audio_language.py => audio_language.py} | 0 .../{offline_inference.py => basic.py} | 0 ...y => basic_with_model_default_sampling.py} | 0 .../{offline_inference_chat.py => chat.py} | 0 ..._chat_with_tools.py => chat_with_tools.py} | 0 ...ce_classification.py => classification.py} | 0 .../{offline_inference_cli.py => cli.py} | 0 ...nference_distributed.py => distributed.py} | 0 ...ne_inference_embedding.py => embedding.py} | 0 ..._encoder_decoder.py => encoder_decoder.py} | 0 .../offline_inference/florence2_inference.py | 2 +- ...ence_mlpspeculator.py => mlpspeculator.py} | 0 ...{offline_inference_neuron.py => neuron.py} | 0 ...ization.py => neuron_int8_quantization.py} | 0 .../openai_batch.md} | 18 ++++++++--------- .../openai_example_batch.jsonl | 0 ...ffline_inference_pixtral.py => pixtral.py} | 0 ...rence_with_prefix.py => prefix_caching.py} | 0 
.../{offline_profile.py => profiling.py} | 2 +- ...ffline_inference_scoring.py => scoring.py} | 0 ...e_with_profiler.py => simple_profiling.py} | 0 ...tured_outputs.py => structured_outputs.py} | 0 .../{offline_inference_tpu.py => tpu.py} | 0 ..._vision_language.py => vision_language.py} | 0 ...edding.py => vision_language_embedding.py} | 0 ...mage.py => vision_language_multi_image.py} | 0 ...ffline_inference_whisper.py => whisper.py} | 0 tests/plugins_tests/test_platform_plugins.py | 2 +- tools/profiler/print_layerwise_table.py | 2 +- tools/profiler/visualize_layerwise_profile.py | 2 +- 46 files changed, 46 insertions(+), 46 deletions(-) rename examples/offline_inference/{offline_inference_arctic.py => arctic.py} (100%) rename examples/offline_inference/{offline_inference_audio_language.py => audio_language.py} (100%) rename examples/offline_inference/{offline_inference.py => basic.py} (100%) rename examples/offline_inference/{offline_inference_with_default_generation_config.py => basic_with_model_default_sampling.py} (100%) rename examples/offline_inference/{offline_inference_chat.py => chat.py} (100%) rename examples/offline_inference/{offline_chat_with_tools.py => chat_with_tools.py} (100%) rename examples/offline_inference/{offline_inference_classification.py => classification.py} (100%) rename examples/offline_inference/{offline_inference_cli.py => cli.py} (100%) rename examples/offline_inference/{offline_inference_distributed.py => distributed.py} (100%) rename examples/offline_inference/{offline_inference_embedding.py => embedding.py} (100%) rename examples/offline_inference/{offline_inference_encoder_decoder.py => encoder_decoder.py} (100%) rename examples/offline_inference/{offline_inference_mlpspeculator.py => mlpspeculator.py} (100%) rename examples/offline_inference/{offline_inference_neuron.py => neuron.py} (100%) rename examples/offline_inference/{offline_inference_neuron_int8_quantization.py => neuron_int8_quantization.py} (100%) rename examples/offline_inference/{offline_inference_openai/offline_inference_openai.md => openai/openai_batch.md} (92%) rename examples/offline_inference/{offline_inference_openai => openai}/openai_example_batch.jsonl (100%) rename examples/offline_inference/{offline_inference_pixtral.py => pixtral.py} (100%) rename examples/offline_inference/{offline_inference_with_prefix.py => prefix_caching.py} (100%) rename examples/offline_inference/{offline_profile.py => profiling.py} (99%) rename examples/offline_inference/{offline_inference_scoring.py => scoring.py} (100%) rename examples/offline_inference/{offline_inference_with_profiler.py => simple_profiling.py} (100%) rename examples/offline_inference/{offline_inference_structured_outputs.py => structured_outputs.py} (100%) rename examples/offline_inference/{offline_inference_tpu.py => tpu.py} (100%) rename examples/offline_inference/{offline_inference_vision_language.py => vision_language.py} (100%) rename examples/offline_inference/{offline_inference_vision_language_embedding.py => vision_language_embedding.py} (100%) rename examples/offline_inference/{offline_inference_vision_language_multi_image.py => vision_language_multi_image.py} (100%) rename examples/offline_inference/{offline_inference_whisper.py => whisper.py} (100%) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 5a285be039393..4ae66f6f3215a 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -30,7 +30,7 @@ function cpu_tests() { # offline inference docker exec 
cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference/offline_inference.py" + python3 examples/offline_inference/basic.py" # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 1e5ff77895a38..3e4e409466b8a 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -24,5 +24,5 @@ remove_docker_container # Run the image and test offline inference docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/offline_inference.py + python3 examples/offline_inference/basic.py ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index a50570ab53438..8f3b08212fd6a 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py \ No newline at end of file diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 52d485939b1d0..189714ebb6d75 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 380f7a44a429a..6159b21ff8206 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index a8f021890f742..650af0fac4c61 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \ && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ && python3 /workspace/vllm/tests/tpu/test_compilation.py \ && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ - && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py" + && python3 /workspace/vllm/examples/offline_inference/tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 160e10aa3bb9b..4d344e58db8ac 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -14,6 +14,6 @@ remove_docker_container # Run the image and test offline inference/tensor parallel docker run 
--name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference/offline_inference.py - python3 examples/offline_inference/offline_inference_cli.py -tp 2 + python3 examples/offline_inference/basic.py + python3 examples/offline_inference/cli.py -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7d13269540864..d3bd809cfdf24 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -187,19 +187,19 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference/offline_inference.py + - python3 offline_inference/basic.py - python3 offline_inference/cpu_offload.py - - python3 offline_inference/offline_inference_chat.py - - python3 offline_inference/offline_inference_with_prefix.py + - python3 offline_inference/chat.py + - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/offline_inference_vision_language.py - - python3 offline_inference/offline_inference_vision_language_multi_image.py + - python3 offline_inference/vision_language.py + - python3 offline_inference/vision_language_multi_image.py - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/offline_inference_encoder_decoder.py - - python3 offline_inference/offline_inference_classification.py - - python3 offline_inference/offline_inference_embedding.py - - python3 offline_inference/offline_inference_scoring.py - - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/classification.py + - python3 offline_inference/embedding.py + - python3 offline_inference/scoring.py + - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 97de40ff469f1..001db86bdf555 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve ### Offline Inference -Refer to for an example. +Refer to for an example. 
### OpenAI Server diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index a42c3dd64ad10..1d77c7339a33f 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -257,4 +257,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +Full example: diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md index bb046dd0fd9dc..f4d3eec0377b1 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library $ find / -name *libtcmalloc* # find the dynamic link library path $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference/offline_inference.py # run vLLM +$ python examples/offline_inference/basic.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: @@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference/offline_inference.py +$ python examples/offline_inference/basic.py ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index d7d43785c6c24..6fd0083a9bb7b 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). 
See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 6a5a58ad74ab7..e4b4cd03a90d2 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -46,7 +46,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 324b1f550e694..91db694be29a4 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -88,7 +88,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -103,7 +103,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -125,7 +125,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: ## Online Serving diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index a06f121a6899a..53f5a274e39a3 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -125,13 +125,13 @@ for o in outputs: You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. -Full example: +Full example: ### Audio You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary. 
-Full example: +Full example: ### Embedding diff --git a/examples/offline_inference/offline_inference_arctic.py b/examples/offline_inference/arctic.py similarity index 100% rename from examples/offline_inference/offline_inference_arctic.py rename to examples/offline_inference/arctic.py diff --git a/examples/offline_inference/offline_inference_audio_language.py b/examples/offline_inference/audio_language.py similarity index 100% rename from examples/offline_inference/offline_inference_audio_language.py rename to examples/offline_inference/audio_language.py diff --git a/examples/offline_inference/offline_inference.py b/examples/offline_inference/basic.py similarity index 100% rename from examples/offline_inference/offline_inference.py rename to examples/offline_inference/basic.py diff --git a/examples/offline_inference/offline_inference_with_default_generation_config.py b/examples/offline_inference/basic_with_model_default_sampling.py similarity index 100% rename from examples/offline_inference/offline_inference_with_default_generation_config.py rename to examples/offline_inference/basic_with_model_default_sampling.py diff --git a/examples/offline_inference/offline_inference_chat.py b/examples/offline_inference/chat.py similarity index 100% rename from examples/offline_inference/offline_inference_chat.py rename to examples/offline_inference/chat.py diff --git a/examples/offline_inference/offline_chat_with_tools.py b/examples/offline_inference/chat_with_tools.py similarity index 100% rename from examples/offline_inference/offline_chat_with_tools.py rename to examples/offline_inference/chat_with_tools.py diff --git a/examples/offline_inference/offline_inference_classification.py b/examples/offline_inference/classification.py similarity index 100% rename from examples/offline_inference/offline_inference_classification.py rename to examples/offline_inference/classification.py diff --git a/examples/offline_inference/offline_inference_cli.py b/examples/offline_inference/cli.py similarity index 100% rename from examples/offline_inference/offline_inference_cli.py rename to examples/offline_inference/cli.py diff --git a/examples/offline_inference/offline_inference_distributed.py b/examples/offline_inference/distributed.py similarity index 100% rename from examples/offline_inference/offline_inference_distributed.py rename to examples/offline_inference/distributed.py diff --git a/examples/offline_inference/offline_inference_embedding.py b/examples/offline_inference/embedding.py similarity index 100% rename from examples/offline_inference/offline_inference_embedding.py rename to examples/offline_inference/embedding.py diff --git a/examples/offline_inference/offline_inference_encoder_decoder.py b/examples/offline_inference/encoder_decoder.py similarity index 100% rename from examples/offline_inference/offline_inference_encoder_decoder.py rename to examples/offline_inference/encoder_decoder.py diff --git a/examples/offline_inference/florence2_inference.py b/examples/offline_inference/florence2_inference.py index 49dd2c331db5a..c24096e90004b 100644 --- a/examples/offline_inference/florence2_inference.py +++ b/examples/offline_inference/florence2_inference.py @@ -3,7 +3,7 @@ encoder/decoder models, specifically Florence-2 ''' # TODO(Isotr0py): -# Move to offline_inference/offline_inference_vision_language.py +# Move to offline_inference/vision_language.py # after porting vision backbone from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/offline_inference_mlpspeculator.py 
b/examples/offline_inference/mlpspeculator.py similarity index 100% rename from examples/offline_inference/offline_inference_mlpspeculator.py rename to examples/offline_inference/mlpspeculator.py diff --git a/examples/offline_inference/offline_inference_neuron.py b/examples/offline_inference/neuron.py similarity index 100% rename from examples/offline_inference/offline_inference_neuron.py rename to examples/offline_inference/neuron.py diff --git a/examples/offline_inference/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py similarity index 100% rename from examples/offline_inference/offline_inference_neuron_int8_quantization.py rename to examples/offline_inference/neuron_int8_quantization.py diff --git a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md b/examples/offline_inference/openai/openai_batch.md similarity index 92% rename from examples/offline_inference/offline_inference_openai/offline_inference_openai.md rename to examples/offline_inference/openai/openai_batch.md index 6278a1943fe4a..a4774e57cd9a5 100644 --- a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl +$ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line. 
You can run the batch with the following command, which will write its results to a file called `results.jsonl` ``` -python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -66,10 +66,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. -For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run ``` -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl +$ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -104,7 +104,7 @@ $ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl Now upload your batch file to your S3 bucket. 
``` -aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl b/examples/offline_inference/openai/openai_example_batch.jsonl similarity index 100% rename from examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl rename to examples/offline_inference/openai/openai_example_batch.jsonl diff --git a/examples/offline_inference/offline_inference_pixtral.py b/examples/offline_inference/pixtral.py similarity index 100% rename from examples/offline_inference/offline_inference_pixtral.py rename to examples/offline_inference/pixtral.py diff --git a/examples/offline_inference/offline_inference_with_prefix.py b/examples/offline_inference/prefix_caching.py similarity index 100% rename from examples/offline_inference/offline_inference_with_prefix.py rename to examples/offline_inference/prefix_caching.py diff --git a/examples/offline_inference/offline_profile.py b/examples/offline_inference/profiling.py similarity index 99% rename from examples/offline_inference/offline_profile.py rename to examples/offline_inference/profiling.py index 187a05e4d70a2..8a94b5c2a8623 100644 --- a/examples/offline_inference/offline_profile.py +++ b/examples/offline_inference/profiling.py @@ -363,7 +363,7 @@ def abort_requests(): example: ``` - python examples/offline_inference/offline_profile.py \\ + python examples/offline_inference/profiling.py \\ --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ --enforce-eager run_num_steps -n 2 diff --git a/examples/offline_inference/offline_inference_scoring.py b/examples/offline_inference/scoring.py similarity index 100% rename from examples/offline_inference/offline_inference_scoring.py rename to examples/offline_inference/scoring.py diff --git a/examples/offline_inference/offline_inference_with_profiler.py b/examples/offline_inference/simple_profiling.py similarity index 100% rename from examples/offline_inference/offline_inference_with_profiler.py rename to examples/offline_inference/simple_profiling.py diff --git a/examples/offline_inference/offline_inference_structured_outputs.py b/examples/offline_inference/structured_outputs.py similarity index 100% rename from examples/offline_inference/offline_inference_structured_outputs.py rename to examples/offline_inference/structured_outputs.py diff --git a/examples/offline_inference/offline_inference_tpu.py b/examples/offline_inference/tpu.py similarity index 100% rename from examples/offline_inference/offline_inference_tpu.py rename to examples/offline_inference/tpu.py diff --git a/examples/offline_inference/offline_inference_vision_language.py b/examples/offline_inference/vision_language.py similarity index 100% rename from examples/offline_inference/offline_inference_vision_language.py rename to examples/offline_inference/vision_language.py diff --git a/examples/offline_inference/offline_inference_vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py similarity index 100% rename from examples/offline_inference/offline_inference_vision_language_embedding.py rename to examples/offline_inference/vision_language_embedding.py diff --git 
a/examples/offline_inference/offline_inference_vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py similarity index 100% rename from examples/offline_inference/offline_inference_vision_language_multi_image.py rename to examples/offline_inference/vision_language_multi_image.py diff --git a/examples/offline_inference/offline_inference_whisper.py b/examples/offline_inference/whisper.py similarity index 100% rename from examples/offline_inference/offline_inference_whisper.py rename to examples/offline_inference/whisper.py diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 57518bd3e8299..69698b34c71a3 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -5,7 +5,7 @@ def test_platform_plugins(): import os example_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(current_file))), - "examples", "offline_inference/offline_inference.py") + "examples", "offline_inference/basic.py") runpy.run_path(example_file) # check if the plugin is loaded correctly diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 49366abc7fb56..54cd60c2bc95b 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0): type=str, required=True, help="json trace file output by " - "examples/offline_inference/offline_profile.py") + "examples/offline_inference/profiling.py") parser.add_argument("--phase", type=str, required=True, diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index fa88ed4204d8f..cb56ebd69a8c1 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -538,7 +538,7 @@ def make_plot_title_suffix(profile_json: dict) -> str: type=str, required=True, help="json trace file output by \ - examples/offline_inference/offline_profile.py") + examples/offline_inference/profiling.py") parser.add_argument("--output-directory", type=str, required=False, From f33e033e2782a9258d8ef6a359643944629d4ced Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 10 Jan 2025 23:51:02 +0800 Subject: [PATCH 53/55] [Docs] Fix docstring in `get_ip` function (#11932) Signed-off-by: Kuntai Du --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 8c3e5200b3d98..217ccb25cef6d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -446,7 +446,7 @@ def get_ip() -> str: logger.warning( "The environment variable HOST_IP is deprecated and ignored, as" " it is often used by Docker and other software to" - "interact with the container's network stack. Please" + "interact with the container's network stack. 
Please " "use VLLM_HOST_IP instead to set the IP address for vLLM processes" " to communicate with each other.") if host_ip: From 5959564f94180a6a50e0d394e35a035c0c98a7fb Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 10 Jan 2025 23:51:43 +0800 Subject: [PATCH 54/55] Doc fix in `benchmark_long_document_qa_throughput.py` (#11933) Signed-off-by: Kuntai Du --- benchmarks/benchmark_long_document_qa_throughput.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 13477ef535e86..0b8fba38156f1 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -2,8 +2,7 @@ Offline benchmark to test the long document QA throughput. Example usage: - # This command run the vllm with 50GB CPU memory for offloading - # The workload samples 8 different prompts with a default input + # This workload samples 8 different prompts with a default input # length of 20000 tokens, then replicates each prompt 2 times # in random order. python benchmark_long_document_qa_throughput.py \ From aa1e77a19ce658abcbaa0836f96878a7ae9dea84 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 11 Jan 2025 00:07:58 +0800 Subject: [PATCH 55/55] [Hardware][CPU] Support MOE models on x86 CPU (#11831) Signed-off-by: jiang1.li --- .../getting_started/installation/cpu-x86.md | 2 +- .../decoder_only/language/test_models.py | 4 ++ vllm/model_executor/layers/fused_moe/layer.py | 41 +++++++++++++++++-- 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md index f4d3eec0377b1..26bdcd93ad190 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -5,7 +5,7 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. 
vLLM CPU backend supports the following vLLM features: - Tensor Parallel -- Model Quantization (`INT8 W8A8, AWQ`) +- Model Quantization (`INT8 W8A8, AWQ, GPTQ`) - Chunked-prefill - Prefix-caching - FP8-E5M2 KV-Caching (TODO) diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 2a7ed8826d2f3..4e110366a09f3 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -48,6 +48,10 @@ ), pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm pytest.param("bigcode/starcoder2-3b"), # starcoder2 + pytest.param( + "ehristoforu/Falcon3-MoE-2x7B-Insruct", # mixtral + marks=[pytest.mark.cpu_model], + ) ]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b108cbd52c218..cf5db368926b4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -13,6 +13,7 @@ QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.platforms.interface import CpuArchEnum if current_platform.is_cuda_alike(): from .fused_moe import fused_experts @@ -83,6 +84,20 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + super().process_weights_after_loading(layer) + + if current_platform.is_cpu(): + if current_platform.get_cpu_architecture() == CpuArchEnum.X86: + import intel_extension_for_pytorch as ipex + layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( + layer.w13_weight, + layer.w2_weight, + use_prepack=True, + ) + else: + raise NotImplementedError("CPU MOE only supports x86 arch.") + def apply( self, layer: torch.nn.Module, @@ -142,9 +157,29 @@ def forward_cuda( topk_ids=topk_ids, inplace=True) - def forward_cpu(self, *args, **kwargs): - raise NotImplementedError( - "The CPU backend currently does not support MoE.") + def forward_cpu( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + **kwargs, + ): + assert custom_routing_function is None + return layer.ipex_fusion( + x, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + ) def forward_tpu( self,