diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 3db77d5f16022..64ba1b32fb074 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -21,7 +21,7 @@ steps:
         podSpec:
           priorityClassName: perf-benchmark
           containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
             command:
             - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
             resources:
@@ -51,7 +51,7 @@ steps:
       queue: H200
     plugins:
     - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
         command:
         - bash
         - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -65,13 +65,18 @@ steps:
         - VLLM_USAGE_SOURCE
         - HF_TOKEN
 
+  - block: "Run H100 Benchmark"
+    key: block-h100
+    depends_on: ~
+
   - label: "H100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H100
+    depends_on: block-h100
     plugins:
     - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
         command:
         - bash
         - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
index 19f7160e68a4d..aa0f7ade808e0 100644
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
-URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
 
 TIMEOUT_SECONDS=10
 
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index f78e360b7afd3..93e118fb3eab8 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,7 +1,7 @@
 steps:
   - label: "Build wheel - CUDA 12.1"
     agents:
-      queue: cpu_queue
+      queue: cpu_queue_postmerge
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
@@ -18,7 +18,7 @@ steps:
   - label: "Build wheel - CUDA 11.8"
     # depends_on: block-build-cu118-wheel
     agents:
-      queue: cpu_queue
+      queue: cpu_queue_postmerge
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
@@ -26,3 +26,16 @@ steps:
       - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
+
+  - block: "Build release image"
+    depends_on: ~
+    key: block-release-image-build
+
+  - label: "Build release image"
+    depends_on: block-release-image-build
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index faeac8e2ded36..e0a12afbe7320 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -12,5 +12,8 @@ remove_docker_container() { docker rm -f xpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
-# Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
+# Run the image and test offline inference/tensor parallel
+docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
+    python3 examples/offline_inference.py
+    python3 examples/offline_inference_cli.py -tp 2
+'
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f5591f1098534..8f57006214c88 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -50,9 +50,9 @@ steps:
   - tests/multimodal
   - tests/test_utils
   - tests/worker
-  - tests/test_lazy_torch_compile.py
+  - tests/standalone_tests/lazy_torch_compile.py
   commands:
-  - python3 test_lazy_torch_compile.py
+  - python3 standalone_tests/lazy_torch_compile.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
   - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@@ -61,6 +61,13 @@ steps:
   - pytest -v -s test_utils.py # Utils
   - pytest -v -s worker # Worker
 
+- label: Python-only Installation Test
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+
 - label: Basic Correctness Test # 30min
   #mirror_hardwares: [amd]
   fast_check: true
@@ -230,7 +237,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
   parallelism: 4
 
 - label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -355,6 +362,7 @@ steps:
   - tests/models/embedding/vision_language
   - tests/models/encoder_decoder/vision_language
   commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
     - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
     - pytest -v -s models/embedding/vision_language -m core_model
@@ -370,6 +378,7 @@ steps:
   - tests/models/embedding/vision_language
   - tests/models/encoder_decoder/vision_language
   commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
     # HACK - run phi3v tests separately to sidestep this transformers bug
     # https://github.com/huggingface/transformers/issues/34307
@@ -481,7 +490,6 @@ steps:
 
 - label: LoRA TP Test (Distributed)
   num_gpus: 4
-  soft_fail: true
   source_file_dependencies:
   - vllm/lora
   - tests/lora
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 76dbd4c04d3f3..77162bc82de62 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -1,5 +1,6 @@
 # default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
+# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
 
 FROM $BASE_IMAGE
 
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index c3fed56e8a956..b67849038cf0d 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -24,6 +24,7 @@ class RequestFuncInput:
     model: str
     best_of: int = 1
     logprobs: Optional[int] = None
+    extra_body: Optional[dict] = None
     multi_modal_content: Optional[dict] = None
     ignore_eos: bool = False
 
@@ -36,6 +37,7 @@ class RequestFuncOutput:
     ttft: float = 0.0  # Time to first token
     itl: List[float] = field(
         default_factory=list)  # List of inter-token latencies
+    tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     error: str = ""
 
@@ -242,6 +244,8 @@ async def async_request_openai_completions(
             "stream": True,
             "ignore_eos": request_func_input.ignore_eos,
         }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
         headers = {
             "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
         }
@@ -336,6 +340,8 @@ async def async_request_openai_chat_completions(
             "stream": True,
             "ignore_eos": request_func_input.ignore_eos,
         }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
         headers = {
             "Content-Type": "application/json",
             "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py
new file mode 100644
index 0000000000000..1a0e62598bfcb
--- /dev/null
+++ b/benchmarks/benchmark_guided.py
@@ -0,0 +1,494 @@
+"""Benchmark guided decoding throughput."""
+import argparse
+import dataclasses
+import json
+import os
+import random
+import time
+from typing import List
+
+import datasets
+import pandas as pd
+import uvloop
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
+from vllm.sampling_params import GuidedDecodingParams
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+
+
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images).
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+    """
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    schema: dict
+    structure_type: str = 'json'
+    completion: str = None
+
+
+def run_vllm(requests: List[SampleRequest],
+             engine_args: EngineArgs,
+             n: int,
+             guided_decoding_rate: float = 1.0,
+             warmup: bool = False) -> float:
+    from vllm import LLM, SamplingParams
+    llm = LLM(**vars(engine_args))
+
+    # Add the requests to the engine.
+    prompts: List[str] = []
+    sampling_params: List[SamplingParams] = []
+    # create a list containing random selected true or false
+    guided_decoding_req_idx = random.sample(
+        range(len(requests)), int(len(requests) * guided_decoding_rate))
+
+    if warmup:
+        print(">>>>> Running warmup prompt, for the first 5")
+        # We setup the first 5 requests to warmup FSM
+        # if using xgrammar dataset, we will skip warmup
+        warmup_requests = requests[:5]
+        for i, request in enumerate(warmup_requests):
+            prompts.append(request.prompt)
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=1.0,
+                    top_p=1.0,
+                    ignore_eos=True,
+                    max_tokens=request.expected_output_len,
+                    guided_decoding=GuidedDecodingParams(json=request.schema)
+                    if guided_decoding_rate > 0 else None,
+                ))
+        llm.generate(prompts, sampling_params, use_tqdm=False)
+
+    print(">>>>> Benchmark started...")
+    prompts = []
+    sampling_params = []
+    for i, request in enumerate(requests):
+        prompts.append(request.prompt)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=1.0,
+                top_p=1.0,
+                ignore_eos=True,
+                max_tokens=request.expected_output_len,
+                guided_decoding=GuidedDecodingParams(
+                    **{request.structure_type: request.schema})
+                if i in guided_decoding_req_idx else None,
+            ))
+
+    start = time.perf_counter()
+    outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
+    ret = []
+    for output, request in zip(outputs, requests):
+        generated_text = output.outputs[0].text
+        ret.append({
+            "generated": generated_text,
+            "expected": request.completion
+        })
+    end = time.perf_counter()
+    return end - start, ret
+
+
+async def run_vllm_async(
+        requests: List[SampleRequest],
+        engine_args: AsyncEngineArgs,
+        n: int,
+        guided_decoding_rate: float = 1.0,
+        warmup: bool = False,
+        disable_frontend_multiprocessing: bool = False) -> float:
+    from vllm import SamplingParams
+
+    async with build_async_engine_client_from_engine_args(
+            engine_args, disable_frontend_multiprocessing) as llm:
+
+        # Add the requests to the engine.
+        prompts: List[str] = []
+        sampling_params: List[SamplingParams] = []
+        guided_decoding_req_idx = random.sample(
+            range(len(requests)), int(len(requests) * guided_decoding_rate))
+
+        if warmup:
+            print(">>>>>> Running warmup prompt, for the first 5")
+            # We setup the first 5 requests to warmup FSM
+            # if using xgrammar dataset, we will skip warmup
+            warmup_requests = requests[:5]
+            for i, request in enumerate(warmup_requests):
+                prompts.append(request.prompt)
+                sampling_params.append(
+                    SamplingParams(
+                        n=n,
+                        temperature=1.0,
+                        top_p=1.0,
+                        ignore_eos=True,
+                        max_tokens=request.expected_output_len,
+                        guided_decoding=GuidedDecodingParams(
+                            json=request.schema)
+                        if guided_decoding_rate > 0 else None,
+                    ))
+            generators = []
+            for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+                generator = llm.generate(prompt, sp, request_id=f"test{i}")
+                generators.append(generator)
+            all_gens = merge_async_iterators(*generators)
+            async for i, res in all_gens:
+                pass
+
+        print(">>>>> Benchmark started...")
+        prompts = []
+        sampling_params = []
+        for i, request in enumerate(requests):
+            prompts.append(request.prompt)
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=1.0,
+                    top_p=1.0,
+                    ignore_eos=True,
+                    max_tokens=request.expected_output_len,
+                    guided_decoding=GuidedDecodingParams(json=request.schema)
+                    if i in guided_decoding_req_idx else None,
+                ))
+
+        generators = []
+        start_time = []
+        latencies = []
+        start = time.perf_counter()
+        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+            generator = llm.generate(prompt, sp, request_id=f"test{i}")
+            generators.append(generator)
+            start_time.append(time.perf_counter())
+            latencies.append([])
+        all_gens = merge_async_iterators(*generators)
+        generated_texts = [''] * len(requests)
+        async for i, res in all_gens:
+            generated_texts[i] = res.outputs[0].text
+            lat = time.perf_counter() - start_time[i]
+            latencies[i].append(lat)
+        ret = [{
+            'generated': gt,
+            'expected': req.completion
+        } for gt, req in zip(generated_texts, requests)]
+        end = time.perf_counter()
+        first_latency = pd.Series([lat[0] * 1000 for lat in latencies])
+        next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000
+                                  for lat in latencies])
+        return end - start, ret, (first_latency, next_latency)
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    if args.dataset == 'json':
+        if args.json_schema_path is None:
+            dir_path = os.path.dirname(os.path.realpath(__file__))
+            args.json_schema_path = os.path.join(dir_path,
+                                                 "structured_schemas",
+                                                 "structured_schema_1.json")
+        with open(args.json_schema_path) as f:
+            schema = json.load(f)
+        prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=schema,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "grammar":
+        schema = """
+            ?start: select_statement
+
+            ?select_statement: "SELECT " column_list " FROM " table_name
+
+            ?column_list: column_name ("," column_name)*
+
+            ?table_name: identifier
+
+            ?column_name: identifier
+
+            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+        """
+        prompt = "Generate an SQL query to show the 'username' \
+            and 'email' from the 'users' table."
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=schema,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "regex":
+        regex = r"\w+@\w+\.com\n"
+        args.regex = regex
+        prompt = "Generate an email address for Alan Turing, \
+            who works in Enigma. End in .com and new line. \
+                Example result: alan.turing@enigma.com\n"
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=regex,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "choice":
+        choice = ["Positive", "Negative"]
+        args.choice = choice
+        prompt = "Classify this sentiment: vLLM is wonderful!"
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=choice,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "xgrammar_bench":
+        args.warmup = False
+        requests: List[SampleRequest] = []
+        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+                                        split="train")
+        print(f"dataset has {len(dataset)} entries")
+        len_dataset = len(dataset)
+        for data_point_idx in range(args.num_prompts):
+            idx = data_point_idx
+            while idx >= len_dataset:
+                idx -= len_dataset
+            schema = dataset["schema"][idx]
+            prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
+                                                   tokenize=False)
+            input_len = len(tokenizer(prompt).input_ids)
+            completion = dataset["completion"][idx]
+
+            requests.append(
+                SampleRequest(prompt=prompt,
+                              prompt_len=input_len,
+                              expected_output_len=args.output_len,
+                              schema=schema,
+                              completion=completion))
+
+    return requests
+
+
+def evaluate(ret, args):
+
+    def _eval_correctness_json(expected, actual):
+        # extract json string from string using regex
+        import re
+        actual = actual.replace('\n', '').replace(' ', '').strip()
+        try:
+            actual = re.search(r'\{.*\}', actual).group()
+            actual = json.loads(actual)
+        except Exception:
+            return False
+
+        return True
+
+    def _eval_correctness_choice(expected, actual):
+        return actual in args.choice
+
+    def _eval_correctness_regex(expected, actual):
+        import re
+        return re.match(args.regex, actual) is not None
+
+    def _eval_correctness(expected, actual):
+        if args.structure_type == 'json':
+            return _eval_correctness_json(expected, actual)
+        elif args.structure_type == 'regex':
+            return _eval_correctness_regex(expected, actual)
+        elif args.structure_type == 'choice':
+            return _eval_correctness_choice(expected, actual)
+        else:
+            return None
+
+    scores = []
+    for res in ret:
+        score = _eval_correctness(res['expected'], res['generated'])
+        res['correctness'] = score
+        scores.append(score)
+
+    not_none_scores = [score for score in scores if score is not None]
+
+    return (sum(not_none_scores) / len(not_none_scores) *
+            100) if len(not_none_scores) > 0 else None
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+
+    # async engine is working for 'regex', 'choice' and 'grammar'
+    if args.dataset == 'grammar':
+        args.structure_type = 'grammar'
+        args.async_engine = False
+    elif args.dataset == 'regex':
+        args.structure_type = 'regex'
+        args.async_engine = False
+    elif args.dataset == 'choice':
+        args.structure_type = 'choice'
+        args.async_engine = False
+    else:
+        args.structure_type = 'json'
+
+    if args.no_guided_decoding:
+        args.guided_decoding_ratio = 0
+    if args.save_results:
+        result_file_name = f'{args.guided_decoding_ratio}guided'
+        result_file_name += f"_{args.model.split('/')[-1]}"
+        result_file_name += f"_{args.dataset}"
+        result_file_name += f"_{args.num_prompts}"
+        result_file_name += f"_out{args.output_len}"
+        result_file_name += f"_async{args.async_engine}"
+        result_file_name += f"_warmup{args.warmup}"
+        result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}"
+        result_file_name += ".txt"
+    else:
+        result_file_name = None
+
+    # Synthesize a prompt with the given input length.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    requests = sample_requests(tokenizer, args)
+
+    if args.async_engine:
+        engine_args = AsyncEngineArgs.from_cli_args(args)
+        elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
+            run_vllm_async(requests, engine_args, args.n,
+                           args.guided_decoding_ratio, args.warmup,
+                           args.disable_frontend_multiprocessing))
+    else:
+        engine_args = EngineArgs.from_cli_args(args)
+        elapsed_time, ret = run_vllm(requests, engine_args, args.n,
+                                     args.guided_decoding_ratio, args.warmup)
+        first_latency, next_latency = None, None
+
+    score = evaluate(ret, args)
+    total_num_tokens = sum(request.prompt_len + request.expected_output_len
+                           for request in requests)
+    total_output_tokens = sum(request.expected_output_len
+                              for request in requests)
+    if first_latency is not None:
+        latency_breakdown = "\nFirst token latency(msecs):\n"
+        latency_breakdown += f"{first_latency.describe()}"
+        latency_breakdown += "\nNext token latency(msecs):\n"
+        latency_breakdown += f"{next_latency.describe()}"
+    print(
+        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+        f"{total_output_tokens / elapsed_time:.2f} output tokens/s",
+        f"Correct rate is {score} %",
+        f"{latency_breakdown if first_latency is not None else ''}")
+
+    # Output JSON results if specified
+    if args.output_json or result_file_name:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "total_output_tokens": total_output_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
+            "output_tokens_per_second":
+            f"{total_output_tokens / elapsed_time:.2f}",
+            "correct_rate(%)": score
+        }
+        results = {"outputs": ret, **results}
+        if first_latency is not None:
+            results["first_token_latency(msecs)"] = first_latency.describe(
+            ).to_dict()
+            results["next_token_latency(msecs)"] = next_latency.describe(
+            ).to_dict()
+        if args.output_json:
+            with open(args.output_json, "w") as f:
+                json.dump(results, f, indent=4)
+        elif result_file_name:
+            with open(result_file_name, "w") as f:
+                json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
+    parser = AsyncEngineArgs.add_cli_args(parser)
+
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=512,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument(
+        "--dataset",
+        default='json',
+        choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+    parser.add_argument("--json_schema_path",
+                        type=str,
+                        default=None,
+                        help="Path to json schema.")
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=10,
+                        help="Number of prompts to process.")
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument("--async-engine",
+                        action='store_true',
+                        default=False,
+                        help="Use vLLM async engine rather than LLM class.")
+    parser.add_argument("--no-guided-decoding",
+                        action='store_true',
+                        default=False,
+                        help="Whether to disable JSON decoding or not.")
+    parser.add_argument("--guided-decoding-ratio",
+                        type=float,
+                        default=1.0,
+                        help="Ratio of Guided Decoding requests")
+    parser.add_argument("--disable-frontend-multiprocessing",
+                        action='store_true',
+                        default=False,
+                        help="Disable decoupled async engine frontend.")
+    parser.add_argument("--warmup",
+                        action="store_true",
+                        default=False,
+                        help="Run warmup prompts before benchmark.")
+    parser.add_argument("--save-results",
+                        action="store_true",
+                        default=False,
+                        help="save output results.")
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    main(args)
diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py
new file mode 100644
index 0000000000000..4435d87e18a8a
--- /dev/null
+++ b/benchmarks/benchmark_serving_guided.py
@@ -0,0 +1,881 @@
+r"""Benchmark online serving throughput with guided decoding.
+
+On the server side, run one of the following commands:
+    (vLLM OpenAI API server)
+    vllm serve <your_model> --disable-log-requests
+
+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
+On the client side, run:
+    python benchmarks/benchmark_serving.py \
+        --backend <backend> \
+        --model <your_model> \
+        --dataset json \
+        --guided-decoding-ratio 1.0 \
+        --guided-decoding-backend xgrammar \
+        --request-rate 10 \
+        --num-prompts 1000
+
+    when using tgi backend, add
+        --endpoint /generate_stream
+    to the end of the command above.
+"""
+import argparse
+import asyncio
+import dataclasses
+import json
+import os
+import random
+import time
+import warnings
+from dataclasses import dataclass
+from typing import AsyncGenerator, List, Optional, Tuple
+
+import datasets
+import numpy as np
+import pandas as pd
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+                                  RequestFuncOutput)
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from vllm.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser
+
+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+
+
+@dataclass
+class BenchmarkMetrics:
+    completed: int
+    total_input: int
+    total_output: int
+    request_throughput: float
+    request_goodput: float
+    output_throughput: float
+    total_token_throughput: float
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    std_ttft_ms: float
+    percentiles_ttft_ms: List[Tuple[float, float]]
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    std_tpot_ms: float
+    percentiles_tpot_ms: List[Tuple[float, float]]
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    percentiles_itl_ms: List[Tuple[float, float]]
+    # E2EL stands for end-to-end latency per request.
+    # It is the time taken on the client side from sending
+    # a request to receiving a complete response.
+    mean_e2el_ms: float
+    median_e2el_ms: float
+    std_e2el_ms: float
+    percentiles_e2el_ms: List[Tuple[float, float]]
+
+
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images).
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+    """
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    schema: dict
+    structure_type: str
+    completion: str = None
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    if args.dataset == 'json':
+        if args.json_schema_path is None:
+            dir_path = os.path.dirname(os.path.realpath(__file__))
+            args.json_schema_path = os.path.join(dir_path,
+                                                 "structured_schemas",
+                                                 "structured_schema_1.json")
+        with open(args.json_schema_path) as f:
+            schema = json.load(f)
+        prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=schema,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "grammar":
+        schema = """
+            ?start: select_statement
+
+            ?select_statement: "SELECT " column_list " FROM " table_name
+
+            ?column_list: column_name ("," column_name)*
+
+            ?table_name: identifier
+
+            ?column_name: identifier
+
+            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+        """
+        prompt = "Generate an SQL query to show the 'username' \
+            and 'email' from the 'users' table."
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=schema,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "regex":
+        regex = r"\w+@\w+\.com\n"
+        args.regex = regex
+        prompt = "Generate an email address for Alan Turing, \
+            who works in Enigma. End in .com and new line. \
+                Example result: alan.turing@enigma.com\n"
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=regex,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "choice":
+        choice = ["Positive", "Negative"]
+        args.choice = choice
+        prompt = "Classify this sentiment: vLLM is wonderful!"
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=choice,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "xgrammar_bench":
+        requests: List[SampleRequest] = []
+        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+                                        split="train")
+        print(f"dataset has {len(dataset)} entries")
+        len_dataset = len(dataset)
+        for data_point_idx in range(args.num_prompts):
+            idx = data_point_idx
+            while idx >= len_dataset:
+                idx -= len_dataset
+            schema = dataset["schema"][idx]
+            prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
+                                                   tokenize=False)
+            input_len = len(tokenizer(prompt).input_ids)
+            completion = dataset["completion"][idx]
+
+            requests.append(
+                SampleRequest(prompt=prompt,
+                              prompt_len=input_len,
+                              expected_output_len=args.output_len,
+                              schema=schema,
+                              structure_type=args.structure_type,
+                              completion=completion))
+
+    return requests
+
+
+async def get_request(
+    input_requests: List[SampleRequest],
+    request_rate: float,
+    burstiness: float = 1.0,
+) -> AsyncGenerator[Tuple[int, SampleRequest], None]:
+    """
+    Asynchronously generates requests at a specified rate 
+    with OPTIONAL burstiness.
+    
+    Args:
+        input_requests: 
+            A list of input requests, each represented as a tuple.
+        request_rate: 
+            The rate at which requests are generated (requests/s).
+        burstiness (optional): 
+            The burstiness factor of the request generation. 
+            Only takes effect when request_rate is not inf.
+            Default value is 1, which follows a Poisson process.
+            Otherwise, the request intervals follow a gamma distribution.
+            A lower burstiness value (0 < burstiness < 1) results 
+            in more bursty requests, while a higher burstiness value 
+            (burstiness > 1) results in a more uniform arrival of requests.
+    """
+    input_requests = iter(input_requests)
+
+    # Calculate scale parameter theta to maintain the desired request_rate.
+    assert burstiness > 0, (
+        f"A positive burstiness factor is expected, but given {burstiness}.")
+    theta = 1.0 / (request_rate * burstiness)
+
+    for i, request in enumerate(input_requests):
+        yield i, request
+
+        if request_rate == float("inf"):
+            # If the request rate is infinity, then we don't need to wait.
+            continue
+
+        # Sample the request interval from the gamma distribution.
+        # If burstiness is 1, it follows exponential distribution.
+        interval = np.random.gamma(shape=burstiness, scale=theta)
+        # The next request will be sent after the interval.
+        await asyncio.sleep(interval)
+
+
+def calculate_metrics(
+    input_requests: List[Tuple[str, int, int]],
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+    selected_percentile_metrics: List[str],
+    selected_percentiles: List[float],
+) -> Tuple[BenchmarkMetrics, List[int]]:
+    actual_output_lens: List[int] = []
+    total_input = 0
+    completed = 0
+    good_completed = 0
+    itls: List[float] = []
+    tpots: List[float] = []
+    all_tpots: List[float] = []
+    ttfts: List[float] = []
+    e2els: List[float] = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            # We use the tokenizer to count the number of output tokens for all
+            # serving backends instead of looking at len(outputs[i].itl) since
+            # multiple output tokens may be bundled together
+            # Note : this may inflate the output token count slightly
+            output_len = len(
+                tokenizer(outputs[i].generated_text,
+                          add_special_tokens=False).input_ids)
+            actual_output_lens.append(output_len)
+            total_input += input_requests[i].prompt_len
+            tpot = 0
+            if output_len > 1:
+                tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
+                                                                 1)
+                tpots.append(tpot)
+            outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
+            # Note: if output_len <= 1, we regard tpot as 0 for goodput
+            all_tpots.append(tpot)
+            itls += outputs[i].itl
+            ttfts.append(outputs[i].ttft)
+            e2els.append(outputs[i].latency)
+            completed += 1
+        else:
+            actual_output_lens.append(0)
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2)
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=sum(actual_output_lens),
+        request_throughput=completed / dur_s,
+        request_goodput=good_completed / dur_s,
+        output_throughput=sum(actual_output_lens) / dur_s,
+        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0) *
+        1000,  # ttfts is empty if streaming is not supported by backend
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
+                             for p in selected_percentiles],
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
+                             for p in selected_percentiles],
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
+                            for p in selected_percentiles],
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
+                             for p in selected_percentiles],
+    )
+
+    return metrics, actual_output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    base_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: List[SampleRequest],
+    request_rate: float,
+    burstiness: float,
+    disable_tqdm: bool,
+    profile: bool,
+    selected_percentile_metrics: List[str],
+    selected_percentiles: List[str],
+    ignore_eos: bool,
+    max_concurrency: Optional[int],
+    guided_decoding_ratio: float,
+    guided_decoding_backend: str,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS[backend]
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    def prepare_extra_body(request) -> dict:
+        extra_body = {}
+        # Add the schema to the extra_body
+        extra_body[request.structure_type] = request.schema
+        # Add the specific guided_decoding_backend
+        extra_body["guided_decoding_backend"] = guided_decoding_backend
+        return extra_body
+
+    print("Starting initial single prompt test run...")
+    guided_decoding_req_idx = random.sample(
+        range(len(input_requests)),
+        int(len(input_requests) * guided_decoding_ratio))
+
+    test_request = input_requests[0]
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompt=test_request.prompt,
+        api_url=api_url,
+        prompt_len=test_request.prompt_len,
+        output_len=test_request.expected_output_len,
+        ignore_eos=ignore_eos,
+        extra_body=prepare_extra_body(test_request),
+    )
+    test_output = await request_func(request_func_input=test_input)
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}")
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
+
+    if profile:
+        print("Starting profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_request.prompt,
+            api_url=base_url + "/start_profile",
+            prompt_len=test_request.prompt_len,
+            output_len=test_request.expected_output_len,
+            ignore_eos=ignore_eos,
+            extra_body=prepare_extra_body(test_request),
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler started")
+
+    if burstiness == 1.0:
+        distribution = "Poisson process"
+    else:
+        distribution = "Gamma distribution"
+
+    print(f"Traffic request rate: {request_rate}")
+    print(f"Burstiness factor: {burstiness} ({distribution})")
+    print(f"Maximum request concurrency: {max_concurrency}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    # This can be used once the minimum Python version is 3.10 or higher,
+    # and it will simplify the code in limited_request_func.
+    #    semaphore = (asyncio.Semaphore(max_concurrency)
+    #                 if max_concurrency else contextlib.nullcontext())
+    semaphore = (asyncio.Semaphore(max_concurrency)
+                 if max_concurrency else None)
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input,
+                                      pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input,
+                                      pbar=pbar)
+
+    benchmark_start_time = time.perf_counter()
+    tasks: List[asyncio.Task] = []
+    expected: List[str] = []
+    async for i, request in get_request(input_requests, request_rate,
+                                        burstiness):
+        extra_body = prepare_extra_body(
+            request) if i in guided_decoding_req_idx else None
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=request.prompt,
+            api_url=api_url,
+            prompt_len=request.prompt_len,
+            output_len=request.expected_output_len,
+            ignore_eos=ignore_eos,
+            extra_body=extra_body,
+        )
+        expected.append(request.completion)
+        tasks.append(
+            asyncio.create_task(
+                limited_request_func(request_func_input=request_func_input,
+                                     pbar=pbar)))
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    if profile:
+        print("Stopping profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_request.prompt,
+            api_url=base_url + "/stop_profile",
+            prompt_len=test_request.prompt_len,
+            output_len=test_request.expected_output_len,
+            extra_body={test_request.structure_type: test_request.schema},
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler stopped")
+
+    if pbar is not None:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics, actual_output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        selected_percentile_metrics=selected_percentile_metrics,
+        selected_percentiles=selected_percentiles,
+    )
+
+    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
+                                    benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:",
+                                 metrics.total_output))
+    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
+                                    metrics.request_throughput))
+    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+                                    metrics.output_throughput))
+    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
+                                    metrics.total_token_throughput))
+
+    result = {
+        "duration":
+        benchmark_duration,
+        "completed":
+        metrics.completed,
+        "total_input_tokens":
+        metrics.total_input,
+        "total_output_tokens":
+        metrics.total_output,
+        "request_throughput":
+        metrics.request_throughput,
+        "output_throughput":
+        metrics.output_throughput,
+        "total_token_throughput":
+        metrics.total_token_throughput,
+        "ttft_description":
+        pd.Series([output.ttft for output in outputs]).describe().to_dict(),
+        "tpot_description":
+        pd.Series([output.tpot for output in outputs]).describe().to_dict(),
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens":
+        actual_output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "errors": [output.error for output in outputs],
+    }
+
+    ret = [{
+        'generated': output.generated_text,
+        'expected': gt
+    } for output, gt in zip(outputs, expected)]
+
+    def process_one_metric(
+        # E.g., "ttft"
+        metric_attribute_name: str,
+        # E.g., "TTFT"
+        metric_name: str,
+        # E.g., "Time to First Token"
+        metric_header: str,
+    ):
+        # This function prints and adds statistics of the specified
+        # metric.
+        if metric_attribute_name not in selected_percentile_metrics:
+            return
+        print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
+        print("{:<40} {:<10.2f}".format(
+            f"Mean {metric_name} (ms):",
+            getattr(metrics, f"mean_{metric_attribute_name}_ms")))
+        print("{:<40} {:<10.2f}".format(
+            f"Median {metric_name} (ms):",
+            getattr(metrics, f"median_{metric_attribute_name}_ms")))
+        result[f"mean_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"mean_{metric_attribute_name}_ms")
+        result[f"median_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"median_{metric_attribute_name}_ms")
+        result[f"std_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"std_{metric_attribute_name}_ms")
+        for p, value in getattr(metrics,
+                                f"percentiles_{metric_attribute_name}_ms"):
+            p_word = str(int(p)) if int(p) == p else str(p)
+            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
+                                            value))
+            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
+
+    process_one_metric("ttft", "TTFT", "Time to First Token")
+    process_one_metric("tpot", "TPOT",
+                       "Time per Output Token (excl. 1st token)")
+    process_one_metric("itl", "ITL", "Inter-token Latency")
+    process_one_metric("e2el", "E2EL", "End-to-end Latency")
+
+    print("=" * 50)
+
+    return result, ret
+
+
+def evaluate(ret, args):
+
+    def _eval_correctness_json(expected, actual):
+        # extract json string from string using regex
+        import re
+        actual = actual.replace('\n', '').replace(' ', '').strip()
+        try:
+            actual = re.search(r'\{.*\}', actual).group()
+            actual = json.loads(actual)
+        except Exception:
+            return False
+
+        return True
+
+    def _eval_correctness_choice(expected, actual):
+        return actual in args.choice
+
+    def _eval_correctness_regex(expected, actual):
+        import re
+        return re.match(args.regex, actual) is not None
+
+    def _eval_correctness(expected, actual):
+        if args.structure_type == 'guided_json':
+            return _eval_correctness_json(expected, actual)
+        elif args.structure_type == 'guided_regex':
+            return _eval_correctness_regex(expected, actual)
+        elif args.structure_type == 'guided_choice':
+            return _eval_correctness_choice(expected, actual)
+        else:
+            return None
+
+    scores = []
+    for res in ret:
+        score = _eval_correctness(res['expected'], res['generated'])
+        res['correctness'] = score
+        scores.append(score)
+
+    not_none_scores = [score for score in scores if score is not None]
+
+    return (sum(not_none_scores) / len(not_none_scores) *
+            100) if len(not_none_scores) > 0 else None
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+        base_url = f"{args.base_url}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+        base_url = f"http://{args.host}:{args.port}"
+
+    tokenizer = get_tokenizer(tokenizer_id,
+                              trust_remote_code=args.trust_remote_code)
+
+    if args.dataset == 'grammar':
+        args.structure_type = 'guided_grammar'
+    elif args.dataset == 'regex':
+        args.structure_type = 'guided_regex'
+    elif args.dataset == 'choice':
+        args.structure_type = 'guided_choice'
+    else:
+        args.structure_type = 'guided_json'
+
+    if args.no_guided_decoding:
+        args.guided_decoding_ratio = 0
+    if args.save_results:
+        result_file_name = f'{args.guided_decoding_ratio}guided'
+        result_file_name += f"_{backend}"
+        result_file_name += f"_{args.request_rate}qps"
+        result_file_name += f"_{args.model.split('/')[-1]}"
+        result_file_name += f"_{args.dataset}"
+        result_file_name += f"_{args.num_prompts}"
+        result_file_name += f"_out{args.output_len}"
+        result_file_name += ".txt"
+    else:
+        result_file_name = None
+
+    input_requests = sample_requests(tokenizer, args)
+
+    benchmark_result, ret = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            burstiness=args.burstiness,
+            disable_tqdm=args.disable_tqdm,
+            profile=args.profile,
+            selected_percentile_metrics=args.percentile_metrics.split(","),
+            selected_percentiles=[
+                float(p) for p in args.metric_percentiles.split(",")
+            ],
+            ignore_eos=args.ignore_eos,
+            max_concurrency=args.max_concurrency,
+            guided_decoding_ratio=args.guided_decoding_ratio,
+            guided_decoding_backend=args.guided_decoding_backend,
+        ))
+
+    # Save config and results to json
+    score = evaluate(ret, args)
+    print("correct_rate(%)", score, '\n')
+    if args.save_results:
+        results = {
+            "backend":
+            backend,
+            "model_id":
+            model_id,
+            "tokenizer_id":
+            tokenizer_id,
+            "num_prompts":
+            args.num_prompts,
+            "request_rate":
+            args.request_rate if args.request_rate < float("inf") else "inf",
+            "burstiness":
+            args.burstiness,
+            "max_concurrency":
+            args.max_concurrency,
+            "correct_rate(%)":
+            score
+        }
+        results = {"outputs": ret, **results, **benchmark_result}
+
+        # Save to file
+        if args.result_filename:
+            result_file_name = args.result_filename
+        if args.result_dir:
+            result_file_name = os.path.join(args.result_dir, result_file_name)
+        with open(result_file_name, "w", encoding='utf-8') as outfile:
+            json.dump(results, outfile, indent=4)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/v1/completions",
+        help="API endpoint.",
+    )
+    parser.add_argument(
+        "--dataset",
+        default='json',
+        choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+    parser.add_argument("--json_schema_path",
+                        type=str,
+                        default=None,
+                        help="Path to json schema.")
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.")
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
+    )
+    parser.add_argument(
+        "--burstiness",
+        type=float,
+        default=1.0,
+        help="Burstiness factor of the request generation. "
+        "Only take effect when request_rate is not inf. "
+        "Default value is 1, which follows Poisson process. "
+        "Otherwise, the request intervals follow a gamma distribution. "
+        "A lower burstiness value (0 < burstiness < 1) results in more "
+        "bursty requests. A higher burstiness value (burstiness > 1) "
+        "results in a more uniform arrival of requests.",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--save-results",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--result-dir",
+        type=str,
+        default=None,
+        help="Specify directory to save benchmark json results."
+        "If not specified, results are saved in the current directory.",
+    )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results."
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )
+    parser.add_argument(
+        "--ignore-eos",
+        action="store_true",
+        help="Set ignore_eos flag when sending the benchmark request."
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
+    parser.add_argument(
+        "--percentile-metrics",
+        type=str,
+        default="ttft,tpot,itl",
+        help="Comma-seperated list of selected metrics to report percentils. "
+        "This argument specifies the metrics to report percentiles. "
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
+        "Default value is \"ttft,tpot,itl\".")
+    parser.add_argument(
+        "--metric-percentiles",
+        type=str,
+        default="99",
+        help="Comma-seperated list of percentiles for selected metrics. "
+        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
+        "Default value is \"99\". "
+        "Use \"--percentile-metrics\" to select metrics.",
+    )
+    parser.add_argument("--no-guided-decoding",
+                        action='store_true',
+                        default=False,
+                        help="Whether to disable JSON decoding or not.")
+    parser.add_argument("--guided-decoding-ratio",
+                        type=float,
+                        default=1.0,
+                        help="Ratio of Guided Decoding requests")
+    parser.add_argument("--guided-decoding-backend",
+                        type=str,
+                        choices=["outlines", "lm-format-enforcer", "xgrammar"],
+                        default="xgrammar",
+                        help="Backend to use for guided decoding")
+
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 159cf055737ce..1e5967bd9bf8b 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -294,23 +294,36 @@ def main(args: argparse.Namespace):
     tokenizer = AutoTokenizer.from_pretrained(
         args.tokenizer, trust_remote_code=args.trust_remote_code)
     if args.dataset is None:
-        # Synthesize a prompt with the given input length.
-        # As tokenizer may add additional tokens like BOS, we need to try
-        # different lengths to get the desired input length.
-        for i in range(-10, 10):
-            prompt = "hi " * (args.input_len + i)
-            tokenized_prompt = tokenizer(prompt).input_ids
-            if len(tokenized_prompt) == args.input_len:
-                break
-        else:
-            raise ValueError(
-                f"Failed to synthesize a prompt with {args.input_len} tokens.")
-        requests = [
-            SampleRequest(prompt=prompt,
-                          prompt_len=args.input_len,
-                          expected_output_len=args.output_len)
-            for _ in range(args.num_prompts)
-        ]
+        vocab_size = tokenizer.vocab_size
+        requests = []
+        for _ in range(args.num_prompts):
+            # Synthesize a prompt with the given input length.
+            candidate_ids = [
+                random.randint(0, vocab_size - 1)
+                for _ in range(args.input_len)
+            ]
+            # As tokenizer may add additional tokens like BOS, we need to try
+            # different lengths to get the desired input length.
+            for _ in range(5):  # Max attempts to correct
+                candidate_prompt = tokenizer.decode(candidate_ids)
+                tokenized_len = len(tokenizer.encode(candidate_prompt))
+
+                if tokenized_len == args.input_len:
+                    break
+
+                # Adjust length based on difference
+                diff = args.input_len - tokenized_len
+                if diff > 0:
+                    candidate_ids.extend([
+                        random.randint(100, vocab_size - 100)
+                        for _ in range(diff)
+                    ])
+                else:
+                    candidate_ids = candidate_ids[:diff]
+            requests.append(
+                SampleRequest(prompt=candidate_prompt,
+                              prompt_len=args.input_len,
+                              expected_output_len=args.output_len))
     else:
         requests = sample_requests(tokenizer, args)
 
diff --git a/benchmarks/structured_schemas/structured_schema_1.json b/benchmarks/structured_schemas/structured_schema_1.json
new file mode 100644
index 0000000000000..6003698469e8d
--- /dev/null
+++ b/benchmarks/structured_schemas/structured_schema_1.json
@@ -0,0 +1,113 @@
+{
+    "$schema":
+    "https://json-schema.org/draft/2020-12/schema",
+    "title":
+    "User Profile",
+    "type":
+    "object",
+    "properties": {
+        "userId": {
+            "type": "string",
+            "description": "Unique identifier for the user."
+        },
+        "personalInfo": {
+            "type": "object",
+            "properties": {
+                "firstName": {
+                    "type": "string",
+                    "description": "The user's first name."
+                },
+                "lastName": {
+                    "type": "string",
+                    "description": "The user's last name."
+                },
+                "age": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "description": "The user's age."
+                },
+                "phoneNumbers": {
+                    "type":
+                    "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "type": {
+                                "type": "string",
+                                "enum": ["home", "work", "mobile"],
+                                "description": "Type of phone number."
+                            },
+                            "number": {
+                                "type": "string",
+                                "pattern": "^\\+?[1-9]\\d{1,14}$",
+                                "description": "Phone number in E.164 format."
+                            }
+                        },
+                        "required": ["type", "number"]
+                    },
+                    "description":
+                    "List of phone numbers associated with the user."
+                }
+            },
+            "required": ["firstName", "lastName"]
+        },
+        "address": {
+            "type": "object",
+            "properties": {
+                "street": {
+                    "type": "string",
+                    "description": "Street address."
+                },
+                "city": {
+                    "type": "string",
+                    "description": "City name."
+                },
+                "state": {
+                    "type": "string",
+                    "description": "State or province."
+                },
+                "postalCode": {
+                    "type": "string",
+                    "pattern": "^\\d{5}(-\\d{4})?$",
+                    "description": "Postal code."
+                },
+                "country": {
+                    "type": "string",
+                    "description": "Country name."
+                }
+            },
+            "required": ["street", "city", "state", "postalCode", "country"]
+        },
+        "preferences": {
+            "type": "object",
+            "properties": {
+                "newsletterSubscribed": {
+                    "type":
+                    "boolean",
+                    "description":
+                    "Indicates if the user is subscribed to the newsletter."
+                },
+                "favoriteCategories": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "description": "List of user's favorite categories."
+                }
+            },
+            "required": ["newsletterSubscribed"]
+        },
+        "accountStatus": {
+            "type": "string",
+            "enum": ["active", "inactive", "suspended"],
+            "description": "Current status of the user's account."
+        },
+        "registrationDate": {
+            "type": "string",
+            "format": "date-time",
+            "description": "ISO 8601 formatted date-time of user registration."
+        }
+    },
+    "required":
+    ["userId", "personalInfo", "address", "accountStatus", "registrationDate"]
+}
\ No newline at end of file
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 1be806bbfa43c..8a95279f9a25a 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -307,10 +307,20 @@ void reshape_and_cache_flash(
     torch::Tensor& key_cache,  // [num_blocks, block_size, num_heads, head_size]
     torch::Tensor&
         value_cache,  // [num_blocks, block_size, num_heads, head_size]
-    torch::Tensor& slot_mapping,  // [num_tokens]
+    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
     const std::string& kv_cache_dtype, const double k_scale,
     const double v_scale) {
-  int num_tokens = key.size(0);
+  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
+  // slot_mapping.size(0) because of padding for CUDA graphs.
+  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+  // both include padding.
+  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+  // since key includes padding for CUDA graphs, while slot_mapping does not.
+  // In this case, slot_mapping.size(0) represents the actual number of tokens
+  // before padding.
+  // For compatibility with both cases, we use slot_mapping.size(0) as the
+  // number of tokens.
+  int num_tokens = slot_mapping.size(0);
   int num_heads = key.size(1);
   int head_size = key.size(2);
   int block_size = key_cache.size(1);
diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu
index 498d069c05f0d..dd1e6de2e0180 100644
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -424,7 +424,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
         // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), 
         // (which occurs when `final_state_position` is a non-positivie index)
         // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
-        if (final_state_position < 0 && seqlen > kWidth){
+        if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
             input_t vals_load[kNElts] = {0};
             if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
                 // chunk = n_chunks - 2, a segment of the final state sits in the last index
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 8ea240f59c38f..5c80645b405ae 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -16,4 +16,5 @@ mistral_common >= 1.5.0
 aiohttp
 starlette
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
-partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
\ No newline at end of file
+partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
+requests
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 96ad9a4c26b09..e9d9ac68c9560 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -10,11 +10,13 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 
+import inspect
 import logging
 import os
 import sys
 from typing import List
 
+import requests
 from sphinx.ext import autodoc
 
 logger = logging.getLogger(__name__)
@@ -34,6 +36,7 @@
 extensions = [
     "sphinx.ext.napoleon",
     "sphinx.ext.viewcode",
+    "sphinx.ext.linkcode",
     "sphinx.ext.intersphinx",
     "sphinx_copybutton",
     "sphinx.ext.autodoc",
@@ -94,6 +97,69 @@ def setup(app):
     generate_examples()
 
 
+_cached_base: str = ""
+_cached_branch: str = ""
+
+
+def get_repo_base_and_branch(pr_number):
+    global _cached_base, _cached_branch
+    if _cached_base and _cached_branch:
+        return _cached_base, _cached_branch
+
+    url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}"
+    response = requests.get(url)
+    if response.status_code == 200:
+        data = response.json()
+        _cached_base = data['head']['repo']['full_name']
+        _cached_branch = data['head']['ref']
+        return _cached_base, _cached_branch
+    else:
+        logger.error("Failed to fetch PR details: %s", response)
+        return None, None
+
+
+def linkcode_resolve(domain, info):
+    if domain != 'py':
+        return None
+    if not info['module']:
+        return None
+    filename = info['module'].replace('.', '/')
+    module = info['module']
+
+    # try to determine the correct file and line number to link to
+    obj = sys.modules[module]
+
+    # get as specific as we can
+    lineno: int = 0
+    filename: str = ""
+    try:
+        for part in info['fullname'].split('.'):
+            obj = getattr(obj, part)
+
+            if not (inspect.isclass(obj) or inspect.isfunction(obj)
+                    or inspect.ismethod(obj)):
+                obj = obj.__class__  # Get the class of the instance
+
+            lineno = inspect.getsourcelines(obj)[1]
+            filename = (inspect.getsourcefile(obj)
+                        or f"{filename}.py").split("vllm/", 1)[1]
+    except Exception:
+        # For some things, like a class member, won't work, so
+        # we'll use the line number of the parent (the class)
+        pass
+
+    if filename.startswith("checkouts/"):
+        # a PR build on readthedocs
+        pr_number = filename.split("/")[1]
+        filename = filename.split("/", 2)[2]
+        base, branch = get_repo_base_and_branch(pr_number)
+        if base and branch:
+            return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
+
+    # Otherwise, link to the source file on the main branch
+    return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"
+
+
 # Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
     "compressed_tensors",
@@ -112,6 +178,7 @@ def setup(app):
     "tensorizer",
     "pynvml",
     "outlines",
+    "xgrammar,"
     "librosa",
     "soundfile",
     "gguf",
diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.rst
index 30f543abc20c7..c6d47f90b62d5 100644
--- a/docs/source/design/multimodal/multimodal_index.rst
+++ b/docs/source/design/multimodal/multimodal_index.rst
@@ -7,7 +7,7 @@ Multi-Modality
     
 vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
 
-Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
+Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_mm_models>`
 via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`.
 
 Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
@@ -15,9 +15,6 @@ by following :ref:`this guide <adding_multimodal_plugin>`.
 
 Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.
 
-..
-  TODO: Add usage of --limit-mm-per-prompt when multi-image input is officially supported
-
 Guides
 ++++++
 
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index e3dbbc9affe66..9b6cb0e80d60e 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -21,7 +21,7 @@ You can install vLLM using pip:
 .. code-block:: console
 
     $ # (Recommended) Create a new conda environment.
-    $ conda create -n myenv python=3.10 -y
+    $ conda create -n myenv python=3.12 -y
     $ conda activate myenv
 
     $ # Install vLLM with CUDA 12.1.
@@ -73,7 +73,7 @@ Another way to access the latest code is to use the docker images:
 .. code-block:: console
 
     $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-    $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}
+    $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
 
 These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
 
@@ -89,45 +89,24 @@ Build from source
 Python-only build (without compilation)
 ---------------------------------------
 
-If you only need to change Python code, you can simply build vLLM without compilation.
-
-The first step is to install the latest vLLM wheel:
-
-.. code-block:: console
-
-    pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-
-You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
-
-After verifying that the installation is successful, you can use `the following script <https://github.com/vllm-project/vllm/blob/main/python_only_dev.py>`_:
+If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag <https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs>`_, changes you make to the code will be reflected when you run vLLM:
 
 .. code-block:: console
 
     $ git clone https://github.com/vllm-project/vllm.git
     $ cd vllm
-    $ python python_only_dev.py
+    $ VLLM_USE_PRECOMPILED=1 pip install --editable .
 
-The script will:
+This will download the latest nightly wheel and use the compiled libraries from there in the install.
 
-* Find the installed vLLM package in the current environment.
-* Copy built files to the current directory.
-* Rename the installed vLLM package.
-* Symbolically link the current directory to the installed vLLM package.
-
-Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM.
-
-Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script <https://github.com/vllm-project/vllm/blob/main/python_only_dev.py>`_ with the ``--quit-dev`` (or ``-q`` for short) flag:
+The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel <https://pypi.org/project/vllm/#files>`_:
 
 .. code-block:: console
 
-    $ python python_only_dev.py --quit-dev
-
-The ``--quit-dev`` flag will:
-
-* Remove the symbolic link from the current directory to the vLLM package.
-* Restore the original vLLM package from the backup.
+   $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
+   $ pip install --editable .
 
-If you update the vLLM wheel and rebuild from the source to make further edits, you will need to repeat the `Python-only build <#python-only-build>`_ steps again.
+You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
 
 .. note::
 
@@ -148,9 +127,13 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
 .. tip::
 
     Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
+
     For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
     As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
 
+    `sccache <https://github.com/mozilla/sccache>`_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments.
+    The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``.
+
 
 Use an existing PyTorch installation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0692e949f1c77..86b1eed2d26ba 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -85,12 +85,8 @@ Documentation
    serving/deploying_with_nginx
    serving/distributed_serving
    serving/metrics
-   serving/env_vars
-   serving/usage_stats
    serving/integrations
    serving/tensorizer
-   serving/compatibility_matrix
-   serving/faq
 
 .. toctree::
    :maxdepth: 1
@@ -99,12 +95,21 @@ Documentation
    models/supported_models
    models/adding_model
    models/enabling_multimodal_inputs
-   models/engine_args
-   models/lora
-   models/vlm
-   models/structured_outputs
-   models/spec_decode
-   models/performance
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Usage
+
+   usage/lora
+   usage/multimodal_inputs
+   usage/structured_outputs
+   usage/spec_decode
+   usage/compatibility_matrix
+   usage/performance
+   usage/faq
+   usage/engine_args
+   usage/env_vars
+   usage/usage_stats
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst
index 49b5285c45590..5c1236e1a8972 100644
--- a/docs/source/models/enabling_multimodal_inputs.rst
+++ b/docs/source/models/enabling_multimodal_inputs.rst
@@ -3,7 +3,7 @@
 Enabling Multimodal Inputs
 ==========================
 
-This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal <multi_modality>` inputs.
+This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal inputs <multimodal_inputs>`.
 
 .. seealso::
     :ref:`adding_a_new_model`
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 9f3b6f59068e2..4e5b10967e3bb 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -471,6 +471,8 @@ Sentence Pair Scoring
 .. note::
     These models are supported in both offline and online inference via Score API.
 
+.. _supported_mm_models:
+
 Multimodal Language Models
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -489,13 +491,11 @@ On the other hand, modalities separated by :code:`/` are mutually exclusive.
 
 - e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
 
-.. _supported_vlms:
-
 Text Generation
 ---------------
 
 .. list-table::
-  :widths: 25 25 15 25 5 5
+  :widths: 25 25 15 20 5 5 5
   :header-rows: 1
 
   * - Architecture
@@ -504,151 +504,194 @@ Text Generation
     - Example HF Models
     - :ref:`LoRA <lora>`
     - :ref:`PP <distributed_serving>`
+    - V1
   * - :code:`AriaForConditionalGeneration`
     - Aria
     - T + I
     - :code:`rhymes-ai/Aria`
     - 
     - ✅︎
+    - 
   * - :code:`Blip2ForConditionalGeneration`
     - BLIP-2
     - T + I\ :sup:`E`
     - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
     -
     - ✅︎
+    - 
   * - :code:`ChameleonForConditionalGeneration`
     - Chameleon
     - T + I
     - :code:`facebook/chameleon-7b` etc.
     - 
     - ✅︎
+    - 
   * - :code:`FuyuForCausalLM`
     - Fuyu
     - T + I
     - :code:`adept/fuyu-8b` etc.
     - 
     - ✅︎
+    - 
   * - :code:`ChatGLMModel`
     - GLM-4V
     - T + I
     - :code:`THUDM/glm-4v-9b` etc.
     - ✅︎
     - ✅︎
+    - 
   * - :code:`H2OVLChatModel`
     - H2OVL
     - T + I\ :sup:`E+`
     - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
     - 
     - ✅︎
+    - 
   * - :code:`Idefics3ForConditionalGeneration`
     - Idefics3
     - T + I
     - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc.
     - ✅︎
+    -
     - 
   * - :code:`InternVLChatModel`
-    - InternVL2
+    - InternVL 2.5, Mono-InternVL, InternVL 2.0
     - T + I\ :sup:`E+`
-    - :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
+    - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc.
     - 
     - ✅︎
+    - ✅︎
   * - :code:`LlavaForConditionalGeneration`
     - LLaVA-1.5
     - T + I\ :sup:`E+`
-    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
+    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
     -
     - ✅︎
+    - ✅︎
   * - :code:`LlavaNextForConditionalGeneration`
     - LLaVA-NeXT
     - T + I\ :sup:`E+`
     - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
     -
     - ✅︎
+    - 
   * - :code:`LlavaNextVideoForConditionalGeneration`
     - LLaVA-NeXT-Video
     - T + V
     - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
     -
     - ✅︎
+    - 
   * - :code:`LlavaOnevisionForConditionalGeneration`
     - LLaVA-Onevision
     - T + I\ :sup:`+` + V\ :sup:`+`
     - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
     -
     - ✅︎
+    - 
   * - :code:`MiniCPMV`
     - MiniCPM-V
     - T + I\ :sup:`E+`
     - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
     - ✅︎
     - ✅︎
+    - 
   * - :code:`MllamaForConditionalGeneration`
     - Llama 3.2
     - T + I\ :sup:`+`
     - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
     -
     -
+    -
   * - :code:`MolmoForCausalLM`
     - Molmo
     - T + I
     - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc.
     -
     - ✅︎
+    - ✅︎
   * - :code:`NVLM_D_Model`
     - NVLM-D 1.0
     - T + I\ :sup:`E+`
     - :code:`nvidia/NVLM-D-72B`, etc.
     - 
     - ✅︎
+    - ✅︎
   * - :code:`PaliGemmaForConditionalGeneration`
     - PaliGemma
     - T + I\ :sup:`E`
     - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
     - 
     - ✅︎
+    - 
   * - :code:`Phi3VForCausalLM`
     - Phi-3-Vision, Phi-3.5-Vision
     - T + I\ :sup:`E+`
     - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
     -
     - ✅︎
+    - ✅︎
   * - :code:`PixtralForConditionalGeneration`
     - Pixtral
     - T + I\ :sup:`+`
     - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc.
     -
     - ✅︎
+    - ✅︎
   * - :code:`QWenLMHeadModel`
     - Qwen-VL
     - T + I\ :sup:`E+`
     - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
     - ✅︎
     - ✅︎
+    -
   * - :code:`Qwen2AudioForConditionalGeneration`
     - Qwen2-Audio
     - T + A\ :sup:`+`
     - :code:`Qwen/Qwen2-Audio-7B-Instruct`
     -
     - ✅︎
+    - 
   * - :code:`Qwen2VLForConditionalGeneration`
     - Qwen2-VL
     - T + I\ :sup:`E+` + V\ :sup:`E+`
     - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
     - ✅︎
     - ✅︎
+    - 
   * - :code:`UltravoxModel`
     - Ultravox
     - T + A\ :sup:`E+`
     - :code:`fixie-ai/ultravox-v0_3`
     -
     - ✅︎
+    - 
 
 | :sup:`E` Pre-computed embeddings can be inputted for this modality.
 | :sup:`+` Multiple items can be inputted per text prompt for this modality.
 
+.. important::
+    To enable multiple multi-modal items per text prompt, you have to set :code:`limit_mm_per_prompt` (offline inference)
+    or :code:`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
+
+    .. code-block:: python
+
+        llm = LLM(
+            model="Qwen/Qwen2-VL-7B-Instruct",
+            limit_mm_per_prompt={"image": 4},
+        )
+
+    .. code-block:: bash
+
+        vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
+
 .. note::
   vLLM currently only supports adding LoRA to the language backbone of multimodal models.
 
+.. note::
+  To use :code:`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo (:code:`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`)
+  and pass :code:`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+
 .. note::
   The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
   For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
diff --git a/docs/source/serving/deploying_with_kubeai.rst b/docs/source/serving/deploying_with_kubeai.rst
new file mode 100644
index 0000000000000..ec3c065320fd9
--- /dev/null
+++ b/docs/source/serving/deploying_with_kubeai.rst
@@ -0,0 +1,17 @@
+.. _deploying_with_kubeai:
+
+Deploying with KubeAI
+=====================
+
+`KubeAI <https://github.com/substratusai/kubeai>`_ is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
+
+
+Please see the Installation Guides for environment specific instructions:
+
+* `Any Kubernetes Cluster <https://www.kubeai.org/installation/any/>`_
+* `EKS <https://www.kubeai.org/installation/eks/>`_
+* `GKE <https://www.kubeai.org/installation/gke/>`_
+
+Once you have KubeAI installed, you can
+`configure text generation models <https://www.kubeai.org/how-to/configure-text-generation-models/>`_
+using vLLM.
\ No newline at end of file
diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst
index f39997e0e44d9..0dd505a739863 100644
--- a/docs/source/serving/integrations.rst
+++ b/docs/source/serving/integrations.rst
@@ -6,6 +6,7 @@ Integrations
 
    run_on_sky
    deploying_with_kserve
+   deploying_with_kubeai
    deploying_with_triton
    deploying_with_bentoml
    deploying_with_cerebrium
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index c39cef85897ed..d75e90807ca1d 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -32,7 +32,7 @@ We currently support the following OpenAI APIs:
 - [Completions API](https://platform.openai.com/docs/api-reference/completions)
   - *Note: `suffix` parameter is not supported.*
 - [Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
-  - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst).
+  - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst).
     - *Note: `image_url.detail` parameter is not supported.*
   - We also support `audio_url` content type for audio files.
     - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema.
@@ -41,7 +41,7 @@ We currently support the following OpenAI APIs:
 - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
   - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API),
     which will be treated as a single prompt to the model according to its chat template.
-    - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst).
+    - This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details.
   - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.*
 
 ## Score API for Cross Encoder Models
diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.rst
similarity index 100%
rename from docs/source/serving/compatibility_matrix.rst
rename to docs/source/usage/compatibility_matrix.rst
diff --git a/docs/source/models/engine_args.rst b/docs/source/usage/engine_args.rst
similarity index 100%
rename from docs/source/models/engine_args.rst
rename to docs/source/usage/engine_args.rst
diff --git a/docs/source/serving/env_vars.rst b/docs/source/usage/env_vars.rst
similarity index 100%
rename from docs/source/serving/env_vars.rst
rename to docs/source/usage/env_vars.rst
diff --git a/docs/source/serving/faq.rst b/docs/source/usage/faq.rst
similarity index 99%
rename from docs/source/serving/faq.rst
rename to docs/source/usage/faq.rst
index 9e858e612c8bf..ce327abd5fa20 100644
--- a/docs/source/serving/faq.rst
+++ b/docs/source/usage/faq.rst
@@ -1,3 +1,5 @@
+.. _faq:
+
 Frequently Asked Questions
 ===========================
 
diff --git a/docs/source/models/lora.rst b/docs/source/usage/lora.rst
similarity index 99%
rename from docs/source/models/lora.rst
rename to docs/source/usage/lora.rst
index ef0177eaf2162..c2c6fa2aebfaf 100644
--- a/docs/source/models/lora.rst
+++ b/docs/source/usage/lora.rst
@@ -1,7 +1,7 @@
 .. _lora:
 
-Using LoRA adapters
-===================
+LoRA Adapters
+=============
 
 This document shows you how to use `LoRA adapters <https://arxiv.org/abs/2106.09685>`_ with vLLM on top of a base model.
 
diff --git a/docs/source/models/vlm.rst b/docs/source/usage/multimodal_inputs.rst
similarity index 62%
rename from docs/source/models/vlm.rst
rename to docs/source/usage/multimodal_inputs.rst
index bcbe50a25fa09..c93f65327e31b 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -1,34 +1,31 @@
-.. _vlm:
+.. _multimodal_inputs:
 
-Using VLMs
-==========
+Multimodal Inputs
+=================
 
-vLLM provides experimental support for Vision Language Models (VLMs). See the :ref:`list of supported VLMs here <supported_vlms>`.
-This document shows you how to run and serve these models using vLLM.
+This page teaches you how to pass multi-modal inputs to :ref:`multi-modal models <supported_mm_models>` in vLLM.
 
 .. note::
-    We are actively iterating on VLM support. See `this RFC <https://github.com/vllm-project/vllm/issues/4194>`_ for upcoming changes,
+    We are actively iterating on multi-modal support. See `this RFC <https://github.com/vllm-project/vllm/issues/4194>`_ for upcoming changes,
     and `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
 
 Offline Inference
 -----------------
 
-Single-image input
-^^^^^^^^^^^^^^^^^^
-
-The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models.
-
-.. code-block:: python
-
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-
-To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`:
+To input multi-modal data, follow this schema in :class:`vllm.inputs.PromptType`:
 
 * ``prompt``: The prompt should follow the format that is documented on HuggingFace.
 * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`.
 
+Image
+^^^^^
+
+You can pass a single image to the :code:`'image'` field of the multi-modal dictionary, as shown in the following examples:
+
 .. code-block:: python
 
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+
     # Refer to the HuggingFace repo for the correct format to use
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
@@ -41,41 +38,6 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT
         "multi_modal_data": {"image": image},
     })
 
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-    # Inference with image embeddings as input
-    image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": {"image": image_embeds},
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-    # Inference with image embeddings as input with additional parameters
-    # Specifically, we are conducting a trial run of Qwen2VL and MiniCPM-V with the new input format, which utilizes additional parameters.
-    mm_data = {}
-
-    image_embeds = torch.load(...) # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
-    # For Qwen2VL, image_grid_thw is needed to calculate positional encoding.
-    mm_data['image'] = {
-        "image_embeds": image_embeds,
-        "image_grid_thw": torch.load(...) # torch.Tensor of shape (1, 3),
-    }
-    # For MiniCPM-V, image_size_list is needed to calculate details of the sliced image.
-    mm_data['image'] = {
-        "image_embeds": image_embeds,
-        "image_size_list": [image.size] # list of image sizes
-    }
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": mm_data,
-    })
-
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
@@ -102,12 +64,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT
 
 A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.
 
-Multi-image input
-^^^^^^^^^^^^^^^^^
-
-Multi-image input is only supported for a subset of VLMs, as shown :ref:`here <supported_vlms>`.
-
-To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class.
+To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
 .. code-block:: python
 
@@ -118,10 +75,6 @@ To enable multiple multi-modal items per text prompt, you have to set ``limit_mm
         limit_mm_per_prompt={"image": 2},  # The maximum number to accept
     )
 
-Instead of passing in a single image, you can pass in a list of images.
-
-.. code-block:: python
-
     # Refer to the HuggingFace repo for the correct format to use
     prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
 
@@ -169,30 +122,114 @@ Multi-image input can be extended to perform video captioning. We show this with
         generated_text = o.outputs[0].text
         print(generated_text)
 
+Video
+^^^^^
+
+You can pass a list of NumPy arrays directly to the :code:`'video'` field of the multi-modal dictionary
+instead of using multi-image input.
+
+Please refer to `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_ for more details.
+
+Audio
+^^^^^
+
+You can pass a tuple :code:`(array, sampling_rate)` to the :code:`'audio'` field of the multi-modal dictionary.
+
+Please refer to `examples/offline_inference_audio_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_audio_language.py>`_ for more details.
+
+Embedding
+^^^^^^^^^
+
+To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
+pass a tensor of shape :code:`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
+
+.. code-block:: python
+
+    # Inference with image embeddings as input
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+
+    # Refer to the HuggingFace repo for the correct format to use
+    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
+
+    # Embeddings for single image
+    # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
+    image_embeds = torch.load(...)
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": {"image": image_embeds},
+    })
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
+
+.. code-block:: python
+
+    # Construct the prompt based on your model
+    prompt = ...
+
+    # Embeddings for multiple images
+    # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
+    image_embeds = torch.load(...)
+
+    # Qwen2-VL
+    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
+    mm_data = {
+        "image": {
+            "image_embeds": image_embeds,
+            # image_grid_thw is needed to calculate positional encoding.
+            "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3),
+        }
+    }
+
+    # MiniCPM-V
+    llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
+    mm_data = {
+        "image": {
+            "image_embeds": image_embeds,
+            # image_size_list is needed to calculate details of the sliced image.
+            "image_size_list": [image.size for image in images],  # list of image sizes
+        }
+    }
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": mm_data,
+    })
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
 Online Inference
 ----------------
 
-OpenAI Vision API
-^^^^^^^^^^^^^^^^^
+Our OpenAI-compatible server accepts multi-modal data via the `Chat Completions API <https://platform.openai.com/docs/api-reference/chat>`_.
+
+.. important::
+    A chat template is **required** to use Chat Completions API.
+
+    Although most models come with a chat template, for others you have to define one yourself.
+    The chat template can be inferred based on the documentation on the model's HuggingFace repo.
+    For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`__.
+
+Image
+^^^^^
 
-You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
+Image input is supported according to `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
+Here is a simple example using Phi-3.5-Vision.
 
-Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server.
+First, launch the OpenAI-compatible server:
 
 .. code-block:: bash
 
     vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
       --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
 
-.. important::
-    Since OpenAI Vision API is based on `Chat Completions API <https://platform.openai.com/docs/api-reference/chat>`_,
-    a chat template is **required** to launch the API server.
-
-    Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it.
-    The chat template can be inferred based on the documentation on the model's HuggingFace repo.
-    For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
-
-To consume the server, you can use the OpenAI client like in the example below:
+Then, you can use the OpenAI client as follows:
 
 .. code-block:: python
 
@@ -252,22 +289,59 @@ A full code example can be found in `examples/openai_chat_completion_client_for_
 
 .. note::
 
-    By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable:
+    By default, the timeout for fetching images through HTTP URL is ``5`` seconds.
+    You can override this by setting the environment variable:
 
     .. code-block:: console
 
         $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 
-Chat Embeddings API
-^^^^^^^^^^^^^^^^^^^
+Video
+^^^^^
+
+Instead of :code:`image_url`, you can pass a video file via :code:`video_url`.
+
+You can use `these tests <https://github.com/vllm-project/vllm/blob/main/tests/entrypoints/openai/test_video.py>`_ as reference.
+
+.. note::
+
+    By default, the timeout for fetching videos through HTTP URL url is ``30`` seconds.
+    You can override this by setting the environment variable:
+
+    .. code-block:: console
+
+        $ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
 
-vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API <https://platform.openai.com/docs/api-reference/embeddings>`_,
-where a list of ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models.
+Audio
+^^^^^
+
+Instead of :code:`image_url`, you can pass an audio file via :code:`audio_url`.
+
+A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py>`_.
+
+.. note::
+
+    By default, the timeout for fetching audios through HTTP URL is ``10`` seconds.
+    You can override this by setting the environment variable:
+
+    .. code-block:: console
+
+        $ export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
+
+Embedding
+^^^^^^^^^
+
+vLLM's Embeddings API is a superset of OpenAI's `Embeddings API <https://platform.openai.com/docs/api-reference/embeddings>`_,
+where a list of chat ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models.
 
 .. tip::
     The schema of ``messages`` is exactly the same as in Chat Completions API.
+    You can refer to the above tutorials for more details on how to pass each type of multi-modal data.
 
-In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
+Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images.
+Refer to the examples below for illustration.
+
+Here is an end-to-end example using VLM2Vec. To serve the model:
 
 .. code-block:: bash
 
@@ -279,10 +353,8 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
     Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding``
     to run this model in embedding mode instead of text generation mode.
 
-.. important::
-
-    VLM2Vec does not expect chat-based input. We use a `custom chat template <https://github.com/vllm-project/vllm/blob/main/examples/template_vlm2vec.jinja>`_
-    to combine the text and images together.
+    The custom chat template is completely different from the original one for this model,
+    and can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/template_vlm2vec.jinja>`__.
 
 Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
 
@@ -310,7 +382,7 @@ Since the request schema is not defined by OpenAI client, we post a request to t
     response_json = response.json()
     print("Embedding output:", response_json["data"][0]["embedding"])
 
-Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model.
+Below is another example, this time using the ``MrLight/dse-qwen2-2b-mrl-v1`` model.
 
 .. code-block:: bash
 
@@ -319,8 +391,10 @@ Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model.
 
 .. important::
 
-    Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, 
-    which is handled by the jinja template.
+    Like with VLM2Vec, we have to explicitly pass ``--task embedding``.
+    
+    Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, which is handled
+    by `this custom chat template <https://github.com/vllm-project/vllm/blob/main/examples/template_dse_qwen2_vl.jinja>`__.
 
 .. important::
 
diff --git a/docs/source/models/performance.rst b/docs/source/usage/performance.rst
similarity index 100%
rename from docs/source/models/performance.rst
rename to docs/source/usage/performance.rst
diff --git a/docs/source/models/spec_decode.rst b/docs/source/usage/spec_decode.rst
similarity index 97%
rename from docs/source/models/spec_decode.rst
rename to docs/source/usage/spec_decode.rst
index d57ffec53215d..f1f1917f974bb 100644
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/usage/spec_decode.rst
@@ -1,13 +1,16 @@
 .. _spec_decode:
 
-Speculative decoding in vLLM
-============================
+Speculative decoding
+====================
 
 .. warning::
     Please note that speculative decoding in vLLM is not yet optimized and does
     not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work
     to optimize it is ongoing and can be followed in `this issue. <https://github.com/vllm-project/vllm/issues/4630>`_
 
+.. warning::
+    Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
+
 This document shows how to use `Speculative Decoding <https://x.com/karpathy/status/1697318534555336961>`_ with vLLM.
 Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.
 
@@ -182,7 +185,7 @@ speculative decoding, breaking down the guarantees into three key areas:
 3. **vLLM Logprob Stability**
    - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the 
    same request across runs. For more details, see the FAQ section 
-   titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_.
+   titled *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs <faq>`.
 
 
 **Conclusion**
@@ -197,7 +200,7 @@ can occur due to following factors:
 
 **Mitigation Strategies**
 
-For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_.
+For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs <faq>`.
 
 Resources for vLLM contributors
 -------------------------------
diff --git a/docs/source/models/structured_outputs.rst b/docs/source/usage/structured_outputs.rst
similarity index 100%
rename from docs/source/models/structured_outputs.rst
rename to docs/source/usage/structured_outputs.rst
diff --git a/docs/source/serving/usage_stats.md b/docs/source/usage/usage_stats.md
similarity index 100%
rename from docs/source/serving/usage_stats.md
rename to docs/source/usage/usage_stats.md
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index f08f22eec164a..c6a274ee5894b 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -223,7 +223,7 @@ def run_internvl(question: str, modality: str):
     # Stop tokens for InternVL
     # models variants may have different stop tokens
     # please refer to the model card for the correct "stop words":
-    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
     stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
     return llm, prompt, stop_token_ids
@@ -419,6 +419,22 @@ def run_aria(question: str, modality: str):
     return llm, prompt, stop_token_ids
 
 
+# Mantis
+def run_mantis(question: str, modality: str):
+    assert modality == "image"
+
+    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
+    prompt = llama3_template.format(f"{question}\n<image>")
+
+    llm = LLM(
+        model="TIGER-Lab/Mantis-8B-siglip-llama3",
+        max_model_len=4096,
+        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
+    )
+    stop_token_ids = [128009]
+    return llm, prompt, stop_token_ids
+
+
 model_example_map = {
     "llava": run_llava,
     "llava-next": run_llava_next,
@@ -441,6 +457,7 @@ def run_aria(question: str, modality: str):
     "glm4v": run_glm4v,
     "idefics3": run_idefics3,
     "aria": run_aria,
+    "mantis": run_mantis,
 }
 
 
diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py
index 788b604cfd4a0..928bbef54eab7 100644
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
@@ -165,7 +165,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
     # Stop tokens for InternVL
     # models variants may have different stop tokens
     # please refer to the model card for the correct "stop words":
-    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
     stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
 
diff --git a/examples/tool_chat_template_llama3.2_json.jinja b/examples/tool_chat_template_llama3.2_json.jinja
index 39f902c1c3c40..2b290c0eede03 100644
--- a/examples/tool_chat_template_llama3.2_json.jinja
+++ b/examples/tool_chat_template_llama3.2_json.jinja
@@ -26,13 +26,11 @@
     {%- endfor %}
 {%- endfor %}
 
-
 {#- This block extracts the system message, so we can slot it into the right place. #}
 {%- if messages[0]['role'] == 'system' %}
     {%- if messages[0]['content'] is string %}
         {%- set system_message = messages[0]['content']|trim %}
     {%- else %}
-        {#- Support vLLM's transforming of a content string to JSON. #}
         {%- set system_message = messages[0]['content'][0]['text']|trim %}
     {%- endif %}
     {%- set messages = messages[1:] %}
@@ -44,14 +42,8 @@
     {%- endif %}
 {%- endif %}
 
-{#- Including an image is not compatible with a system message #}
-{%- if image_ns.has_images and not system_message == "" %}
-    {{- raise_exception("Prompting with images is incompatible with system messages and tool use.") }}
-{%- endif %}
-
-
-{#- System message, if there are no images #}
-{%- if not image_ns.has_images %}
+{#- System message if there are no images, if the user supplied one, or if tools are used (default tool system message) #}
+{%- if system_message or not image_ns.has_images %}
     {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
     {%- if tools is not none %}
         {{- "Environment: ipython\n" }}
diff --git a/python_only_dev.py b/python_only_dev.py
index 1ca0f5c30b741..f70b4984025b3 100644
--- a/python_only_dev.py
+++ b/python_only_dev.py
@@ -1,92 +1,14 @@
-# enable python only development
-# copy compiled files to the current directory directly
+msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation).
 
-import argparse
-import os
-import shutil
-import subprocess
-import sys
-import warnings
+TL;DR:
 
-parser = argparse.ArgumentParser(
-    description="Development mode for python-only code")
-parser.add_argument('-q',
-                    '--quit-dev',
-                    action='store_true',
-                    help='Set the flag to quit development mode')
-args = parser.parse_args()
+VLLM_USE_PRECOMPILED=1 pip install -e .
 
-# cannot directly `import vllm` , because it will try to
-# import from the current directory
-output = subprocess.run([sys.executable, "-m", "pip", "show", "vllm"],
-                        capture_output=True)
+or
 
-assert output.returncode == 0, "vllm is not installed"
+export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+pip install -e .
+""" # noqa
 
-text = output.stdout.decode("utf-8")
-
-package_path = None
-for line in text.split("\n"):
-    if line.startswith("Location: "):
-        package_path = line.split(": ")[1]
-        break
-
-assert package_path is not None, "could not find package path"
-
-cwd = os.getcwd()
-
-assert cwd != package_path, "should not import from the current directory"
-
-files_to_copy = [
-    "vllm/_C.abi3.so",
-    "vllm/_moe_C.abi3.so",
-    "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
-    "vllm/vllm_flash_attn/flash_attn_interface.py",
-    "vllm/vllm_flash_attn/__init__.py",
-    # "vllm/_version.py", # not available in nightly wheels yet
-]
-
-# Try to create _version.py to avoid version related warning
-# Refer to https://github.com/vllm-project/vllm/pull/8771
-try:
-    from setuptools_scm import get_version
-    get_version(write_to="vllm/_version.py")
-except ImportError:
-    warnings.warn(
-        "To avoid warnings related to vllm._version, "
-        "you should install setuptools-scm by `pip install setuptools-scm`",
-        stacklevel=2)
-
-if not args.quit_dev:
-    for file in files_to_copy:
-        src = os.path.join(package_path, file)
-        dst = file
-        print(f"Copying {src} to {dst}")
-        shutil.copyfile(src, dst)
-
-    pre_built_vllm_path = os.path.join(package_path, "vllm")
-    tmp_path = os.path.join(package_path, "vllm_pre_built")
-    current_vllm_path = os.path.join(cwd, "vllm")
-
-    print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup")
-    shutil.copytree(pre_built_vllm_path, tmp_path)
-    shutil.rmtree(pre_built_vllm_path)
-
-    print(f"Linking {current_vllm_path} to {pre_built_vllm_path}")
-    os.symlink(current_vllm_path, pre_built_vllm_path)
-else:
-    vllm_symlink_path = os.path.join(package_path, "vllm")
-    vllm_backup_path = os.path.join(package_path, "vllm_pre_built")
-    current_vllm_path = os.path.join(cwd, "vllm")
-
-    print(f"Unlinking {current_vllm_path} to {vllm_symlink_path}")
-    assert os.path.islink(
-        vllm_symlink_path
-    ), f"not in dev mode: {vllm_symlink_path} is not a symbolic link"
-    assert current_vllm_path == os.readlink(
-        vllm_symlink_path
-    ), "current directory is not the source code of package"
-    os.unlink(vllm_symlink_path)
-
-    print(f"Recovering backup from {vllm_backup_path} to {vllm_symlink_path}")
-    os.rename(vllm_backup_path, vllm_symlink_path)
+print(msg)
diff --git a/requirements-common.txt b/requirements-common.txt
index 02e3d65fb774c..112528880c0ac 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -19,6 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
 outlines >= 0.0.43, < 0.1
+xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 4674efb812cfd..17d40d0ee131a 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@e096d6f
diff --git a/requirements-test.in b/requirements-test.in
index 44972866ddc4b..c0b228148ab31 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -24,9 +24,6 @@ mistral_common[opencv] >= 1.5.0 # required for pixtral test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.4 # required for model evaluation test
 
-# TODO: Add this after fully implementing llava(mantis)
-# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
-
 # quantization
 bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.9
diff --git a/requirements-test.txt b/requirements-test.txt
index a59b85023948b..38a064bca449a 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,8 +1,8 @@
 #
-# This file is autogenerated by pip-compile with Python 3.9
+# This file is autogenerated by pip-compile with Python 3.12
 # by the following command:
 #
-#    pip-compile requirements-test.in
+#    python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
 #
 absl-py==2.1.0
     # via rouge-score
@@ -27,10 +27,6 @@ anyio==4.6.2.post1
     # via httpx
 argcomplete==3.5.1
     # via datamodel-code-generator
-async-timeout==4.0.3
-    # via
-    #   aiohttp
-    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -111,10 +107,6 @@ email-validator==2.2.0
     # via pydantic
 evaluate==0.4.3
     # via lm-eval
-exceptiongroup==1.2.2
-    # via
-    #   anyio
-    #   pytest
 fastrlock==0.8.2
     # via cupy-cuda12x
 filelock==3.16.1
@@ -165,8 +157,6 @@ idna==3.10
     #   httpx
     #   requests
     #   yarl
-importlib-resources==6.4.5
-    # via matplotlib
 inflect==5.6.2
     # via datamodel-code-generator
 iniconfig==2.0.0
@@ -518,12 +508,6 @@ timm==1.0.11
     # via -r requirements-test.in
 tokenizers==0.20.3
     # via transformers
-toml==0.10.2
-    # via datamodel-code-generator
-tomli==2.0.2
-    # via
-    #   black
-    #   pytest
 torch==2.5.1
     # via
     #   -r requirements-test.in
@@ -550,7 +534,7 @@ tqdm==4.66.6
     #   transformers
 tqdm-multiprocess==0.0.11
     # via lm-eval
-transformers==4.45.2
+transformers==4.46.3
     # via
     #   lm-eval
     #   peft
@@ -567,12 +551,9 @@ typepy[datetime]==1.3.2
     #   tabledata
 typing-extensions==4.12.2
     # via
-    #   anyio
-    #   black
     #   huggingface-hub
     #   librosa
     #   mistral-common
-    #   multidict
     #   pydantic
     #   pydantic-core
     #   torch
@@ -590,8 +571,6 @@ xxhash==3.5.0
     #   evaluate
 yarl==1.17.1
     # via aiohttp
-zipp==3.20.2
-    # via importlib-resources
 zstandard==0.23.0
     # via lm-eval
 
diff --git a/setup.py b/setup.py
index 9d4bae6dd265e..6b1b66e41fb11 100644
--- a/setup.py
+++ b/setup.py
@@ -249,6 +249,74 @@ def run(self):
             self.copy_file(file, dst_file)
 
 
+class repackage_wheel(build_ext):
+    """Extracts libraries and other files from an existing wheel."""
+    default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+
+    def run(self) -> None:
+        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",
+                                   self.default_wheel)
+
+        assert _is_cuda(
+        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+
+        import zipfile
+
+        if os.path.isfile(wheel_location):
+            wheel_path = wheel_location
+            print(f"Using existing wheel={wheel_path}")
+        else:
+            # Download the wheel from a given URL, assume
+            # the filename is the last part of the URL
+            wheel_filename = wheel_location.split("/")[-1]
+
+            import tempfile
+
+            # create a temporary directory to store the wheel
+            temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
+            wheel_path = os.path.join(temp_dir, wheel_filename)
+
+            print(f"Downloading wheel from {wheel_location} to {wheel_path}")
+
+            from urllib.request import urlretrieve
+
+            try:
+                urlretrieve(wheel_location, filename=wheel_path)
+            except Exception as e:
+                from setuptools.errors import SetupError
+
+                raise SetupError(
+                    f"Failed to get vLLM wheel from {wheel_location}") from e
+
+        with zipfile.ZipFile(wheel_path) as wheel:
+            files_to_copy = [
+                "vllm/_C.abi3.so",
+                "vllm/_moe_C.abi3.so",
+                "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
+                "vllm/vllm_flash_attn/flash_attn_interface.py",
+                "vllm/vllm_flash_attn/__init__.py",
+                # "vllm/_version.py", # not available in nightly wheels yet
+            ]
+            file_members = filter(lambda x: x.filename in files_to_copy,
+                                  wheel.filelist)
+
+            for file in file_members:
+                print(f"Extracting and including {file.filename} "
+                      "from existing wheel")
+                package_name = os.path.dirname(file.filename).replace("/", ".")
+                file_name = os.path.basename(file.filename)
+
+                if package_name not in package_data:
+                    package_data[package_name] = []
+
+                wheel.extract(file)
+                if file_name.endswith(".py"):
+                    # python files shouldn't be added to package_data
+                    continue
+
+                package_data[package_name].append(file_name)
+
+
 def _is_hpu() -> bool:
     is_hpu_available = True
     try:
@@ -397,12 +465,15 @@ def get_vllm_version() -> str:
         if envs.VLLM_TARGET_DEVICE == "empty":
             version += f"{sep}empty"
     elif _is_cuda():
-        cuda_version = str(get_nvcc_cuda_version())
-        if cuda_version != MAIN_CUDA_VERSION:
-            cuda_version_str = cuda_version.replace(".", "")[:3]
-            # skip this for source tarball, required for pypi
-            if "sdist" not in sys.argv:
-                version += f"{sep}cu{cuda_version_str}"
+        if envs.VLLM_USE_PRECOMPILED:
+            version += ".precompiled"
+        else:
+            cuda_version = str(get_nvcc_cuda_version())
+            if cuda_version != MAIN_CUDA_VERSION:
+                cuda_version_str = cuda_version.replace(".", "")[:3]
+                # skip this for source tarball, required for pypi
+                if "sdist" not in sys.argv:
+                    version += f"{sep}cu{cuda_version_str}"
     elif _is_hip():
         # Get the HIP version
         hipcc_version = get_hipcc_rocm_version()
@@ -515,13 +586,18 @@ def _read_requirements(filename: str) -> List[str]:
 package_data = {
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }
-if envs.VLLM_USE_PRECOMPILED:
-    ext_modules = []
-    package_data["vllm"].append("*.so")
 
 if _no_device():
     ext_modules = []
 
+if not ext_modules:
+    cmdclass = {}
+else:
+    cmdclass = {
+        "build_ext":
+        repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
+    }
+
 setup(
     name="vllm",
     version=get_vllm_version(),
@@ -558,7 +634,7 @@ def _read_requirements(filename: str) -> List[str]:
         "audio": ["librosa", "soundfile"],  # Required for audio processing
         "video": ["decord"]  # Required for video processing
     },
-    cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
+    cmdclass=cmdclass,
     package_data=package_data,
     entry_points={
         "console_scripts": [
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 7ef502abee345..aa11524812cdd 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -7,7 +7,6 @@
 from torch import nn
 from torch.library import Library
 
-from vllm.compilation.compile_context import set_compile_context
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
@@ -81,6 +80,7 @@ def test_simple_piecewise_compile():
         use_cudagraph=True,
         splitting_ops=["silly.attention"],
         cudagraph_copy_inputs=True,
+        cudagraph_capture_sizes=[1, 2],
     ))
     with set_current_vllm_config(vllm_config):
         model = SillyModel(vllm_config=vllm_config, prefix='')
@@ -96,11 +96,10 @@ def test_simple_piecewise_compile():
             6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
 
-        with set_compile_context([1, 2]):
-            model(inputs)
+        model(inputs)
 
-            model(torch.randn(2).cuda())
-            model(torch.randn(1).cuda())
+        model(torch.randn(2).cuda())
+        model(torch.randn(1).cuda())
 
         input = torch.zeros(2).cuda()
         global global_counter
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index dbd5a3bbffeab..07c10a3a18c55 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -13,7 +13,6 @@
 from torch import nn
 from torch.library import Library
 
-from vllm.compilation.compile_context import set_compile_context
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
@@ -256,6 +255,7 @@ def run_model(llama_config,
         compilation_config = CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
+            cudagraph_capture_sizes=[1, 2],
         )
         if split_attn:
             compilation_config.splitting_ops = ["silly.attention"]
@@ -273,10 +273,9 @@ def run_model(llama_config,
     input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
     positions = torch.arange(B).cuda()
 
-    with set_compile_context([1, 2]):
-        model(input_ids, positions)
-        model(input_ids[:2], positions[:2])
-        model(input_ids[:1], positions[:1])
+    model(input_ids, positions)
+    model(input_ids[:2], positions[:2])
+    model(input_ids[:1], positions[:1])
 
     input_ids[:2].zero_()
     output = model(input_ids[:2], positions[:2])
@@ -379,10 +378,13 @@ def benchmark():
                 level=CompilationLevel.PIECEWISE,
                 use_cudagraph=True,
                 splitting_ops=["silly.attention"],
+                cudagraph_capture_sizes=cudagraph_sizes,
             )
         else:
             compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE, )
+                level=CompilationLevel.PIECEWISE,
+                cudagraph_capture_sizes=cudagraph_sizes,
+            )
 
         vllm_config = VllmConfig(compilation_config=compilation_config)
         with set_current_vllm_config(vllm_config):
@@ -396,17 +398,16 @@ def benchmark():
 
         graphs = {}
 
-        with set_compile_context(cudagraph_sizes):
-            model(input_ids, positions)
-            for b in cudagraph_sizes[::-1]:
-                if not piecewise:
-                    graph = torch.cuda.CUDAGraph()
-                    with torch.cuda.graph(graph, pool=pool):
-                        output = model(input_ids[:b], positions[:b])
-                    graphs[b] = (graph, output)
-                else:
+        model(input_ids, positions)
+        for b in cudagraph_sizes[::-1]:
+            if not piecewise:
+                graph = torch.cuda.CUDAGraph()
+                with torch.cuda.graph(graph, pool=pool):
                     output = model(input_ids[:b], positions[:b])
-                    graphs[b] = (model, output)
+                graphs[b] = (graph, output)
+            else:
+                output = model(input_ids[:b], positions[:b])
+                graphs[b] = (model, output)
         for b in cudagraph_sizes:
             if piecewise:
                 # noqa is for `Function definition does not bind loop variable`
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 386877e0e0a2c..b818ca921fcb0 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -247,9 +247,19 @@ def _compare_tp(
     *,
     method: Literal["generate", "encode"],
 ):
-    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
-    multi_node_only, trust_remote_code, tokenizer_mode, \
-        load_format, hf_overrides = test_options
+    (
+        tp_size,
+        pp_size,
+        eager_mode,
+        chunked_prefill,
+    ) = parallel_setup
+    (
+        multi_node_only,
+        trust_remote_code,
+        tokenizer_mode,
+        load_format,
+        hf_overrides,
+    ) = test_options
 
     if num_gpus_available < tp_size * pp_size:
         pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index fb24d6bc2c100..3e9b0e10a11d8 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -61,8 +61,8 @@ def worker_fn():
                         dtype=torch.float32).cuda(pynccl_comm.rank)
     with pynccl_comm.change_state(enable=True):
         tensor = pynccl_comm.all_reduce(tensor)
-    result = tensor.mean().cpu().item()
-    assert result == pynccl_comm.world_size
+    torch.cuda.synchronize()
+    assert torch.all(tensor == pynccl_comm.world_size).cpu().item()
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -86,12 +86,12 @@ def multiple_allreduce_worker_fn():
         if torch.distributed.get_rank() in [0, 1]:
             tensor = pynccl_comm.all_reduce(tensor)
             tensor = pynccl_comm.all_reduce(tensor)
-            result = tensor.mean().cpu().item()
-            assert result == 4
+            torch.cuda.synchronize()
+            assert torch.all(tensor == 4).cpu().item()
         else:
             tensor = pynccl_comm.all_reduce(tensor)
-            result = tensor.mean().cpu().item()
-            assert result == 2
+            torch.cuda.synchronize()
+            assert torch.all(tensor == 2).cpu().item()
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -112,12 +112,12 @@ def multiple_allreduce_with_vllm_worker_fn():
         if torch.distributed.get_rank() in [0, 1]:
             tensor = tensor_model_parallel_all_reduce(tensor)
             tensor = tensor_model_parallel_all_reduce(tensor)
-            result = tensor.mean().cpu().item()
-            assert result == 4
+            torch.cuda.synchronize()
+            assert torch.all(tensor == 4).cpu().item()
         else:
             tensor = tensor_model_parallel_all_reduce(tensor)
-            result = tensor.mean().cpu().item()
-            assert result == 2
+            torch.cuda.synchronize()
+            assert torch.all(tensor == 2).cpu().item()
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -141,10 +141,10 @@ def worker_fn_with_cudagraph():
                 graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
                     enable=True):
             a_out = pynccl_comm.all_reduce(a)
-        pynccl_comm.stream.synchronize()
+        torch.cuda.synchronize()
         graph.replay()
-        pynccl_comm.stream.synchronize()
-        assert a_out.mean().cpu().item() == pynccl_comm.world_size**1
+        torch.cuda.synchronize()
+        assert torch.all(a_out == pynccl_comm.world_size).cpu().item()
 
 
 @worker_fn_wrapper
@@ -170,6 +170,7 @@ def all_gather_worker_fn():
 
     with pynccl_comm.change_state(enable=True):
         pynccl_comm.all_gather(result, tensor)
+    torch.cuda.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
@@ -207,6 +208,7 @@ def reduce_scatter_worker_fn():
 
     with pynccl_comm.change_state(enable=True):
         pynccl_comm.reduce_scatter(result, tensor)
+    torch.cuda.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
@@ -241,8 +243,8 @@ def send_recv_worker_fn():
             pynccl_comm.recv(tensor,
                              src=(pynccl_comm.rank - 1) %
                              pynccl_comm.world_size)
-    result = tensor.mean().cpu().item()
-    assert result == 1
+    torch.cuda.synchronize()
+    assert torch.all(tensor == 1).cpu().item()
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -280,11 +282,11 @@ def multiple_send_recv_worker_fn():
             pynccl_comm.recv(tensor,
                              src=(pynccl_comm.rank - 1) %
                              pynccl_comm.world_size)
-    result = tensor.mean().cpu().item()
+    torch.cuda.synchronize()
     if torch.distributed.get_rank() in [0, 2]:
-        assert result == 1
+        assert torch.all(tensor == 1).cpu().item()
     else:
-        assert result == 2
+        assert torch.all(tensor == 2).cpu().item()
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -293,6 +295,38 @@ def test_pynccl_multiple_send_recv():
     distributed_run(multiple_send_recv_worker_fn, 4)
 
 
+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
+def test_pynccl_broadcast():
+    distributed_run(broadcast_worker_fn, 4)
+
+
+@worker_fn_wrapper
+def broadcast_worker_fn():
+    # Test broadcast for every root rank.
+    # Essentially this is an all-gather operation.
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
+    recv_tensors = [
+        torch.empty(16,
+                    1024,
+                    1024,
+                    dtype=torch.float32,
+                    device=pynccl_comm.device)
+        for i in range(pynccl_comm.world_size)
+    ]
+    recv_tensors[pynccl_comm.rank] = torch.ones(
+        16, 1024, 1024, dtype=torch.float32,
+        device=pynccl_comm.device) * pynccl_comm.rank
+
+    for i in range(pynccl_comm.world_size):
+        pynccl_comm.broadcast(recv_tensors[i], src=i)
+        # the broadcast op might be launched in a different stream
+        # need to synchronize to make sure the tensor is ready
+        torch.cuda.synchronize()
+        assert torch.all(recv_tensors[i] == i).cpu().item()
+
+
 def test_ncclGetUniqueId():
     lib = NCCLLibrary()
     unique_id = lib.ncclGetUniqueId()
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index de78d41ad12eb..4e269de9fc40b 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -50,12 +50,12 @@ def test_compilation_config():
     args = parser.parse_args(["-O=3"])
     assert args.compilation_config.level == 3
 
-    # set to json
-    args = parser.parse_args(["--compilation-config", '{"level": 3}'])
+    # set to string form of a dict
+    args = parser.parse_args(["--compilation-config", "{'level': 3}"])
     assert args.compilation_config.level == 3
 
-    # set to json
-    args = parser.parse_args(['--compilation-config={"level": 3}'])
+    # set to string form of a dict
+    args = parser.parse_args(["--compilation-config={'level': 3}"])
     assert args.compilation_config.level == 3
 
 
diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index e7ef5637c8ccb..0f7d15e1d85aa 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -69,6 +69,37 @@ def sample_json_schema():
     }
 
 
+@pytest.fixture
+def sample_complex_json_schema():
+    return {
+        "type": "object",
+        "properties": {
+            "score": {
+                "type": "integer",
+                "minimum": 0,
+                "maximum": 100  # Numeric range
+            },
+            "grade": {
+                "type": "string",
+                "pattern": "^[A-D]$"  # Regex pattern
+            },
+            "email": {
+                "type": "string",
+                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
+            },
+            "tags": {
+                "type": "array",
+                "items": {
+                    "type": "string",
+                    "pattern":
+                    "^[a-z]{1,10}$"  # Combining length and pattern restrictions
+                }
+            }
+        },
+        "required": ["score", "grade", "email", "tags"]
+    }
+
+
 @pytest.fixture
 def sample_guided_choice():
     return [
diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py
index 67c79415f322a..de6257cfc551c 100644
--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -76,6 +76,34 @@ def test_guided_json_completion(sample_json_schema, llm):
         jsonschema.validate(instance=output_json, schema=sample_json_schema)
 
 
+@pytest.mark.skip_global_cleanup
+def test_guided_complex_json_completion(sample_complex_json_schema, llm):
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(json=sample_complex_json_schema))
+    outputs = llm.generate(prompts=[
+        f"Give an example JSON for an assignment grade "
+        f"that fits this schema: {sample_complex_json_schema}"
+    ] * 2,
+                           sampling_params=sampling_params,
+                           use_tqdm=True)
+
+    assert outputs is not None
+
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        prompt = output.prompt
+
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        output_json = json.loads(generated_text)
+        jsonschema.validate(instance=output_json,
+                            schema=sample_complex_json_schema)
+
+
 @pytest.mark.skip_global_cleanup
 def test_guided_choice_completion(sample_guided_choice, llm):
     sampling_params = SamplingParams(
@@ -159,3 +187,30 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
                      sampling_params=sampling_params,
                      use_tqdm=True,
                      guided_options_request=dict(guided_regex=sample_regex))
+
+
+@pytest.mark.skip_global_cleanup
+def test_guided_json_object(llm):
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=100,
+        guided_decoding=GuidedDecodingParams(json_object=True))
+
+    outputs = llm.generate(
+        prompts=("Generate a JSON object describing a person with name "
+                 "and age for John Smith who is 31 years old."),
+        sampling_params=sampling_params,
+        use_tqdm=True)
+
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+
+        generated_text = output.outputs[0].text
+        print(generated_text)
+        assert generated_text is not None
+
+        # Parse to verify it is valid JSON
+        parsed_json = json.loads(generated_text)
+        assert isinstance(parsed_json, dict)
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 157d873a75b4d..a0b6edd566561 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -89,7 +89,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=772, total_tokens=782)
+        completion_tokens=10, prompt_tokens=775, total_tokens=785)
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -181,7 +181,7 @@ async def test_single_chat_session_image_base64encoded(
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=772, total_tokens=782)
+        completion_tokens=10, prompt_tokens=775, total_tokens=785)
 
     message = choice.message
     message = chat_completion.choices[0].message
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index d0c43b47bf0af..425f2a10ec855 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -95,5 +95,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
     assert len(embeddings["data"]) == 1
     assert len(embeddings["data"][0]["embedding"]) == 3072
     assert embeddings["usage"]["completion_tokens"] == 0
-    assert embeddings["usage"]["prompt_tokens"] == 762
-    assert embeddings["usage"]["total_tokens"] == 762
+    assert embeddings["usage"]["prompt_tokens"] == 765
+    assert embeddings["usage"]["total_tokens"] == 765
diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py
index f9b11018288be..51be2425d7dd7 100644
--- a/tests/kernels/test_causal_conv1d.py
+++ b/tests/kernels/test_causal_conv1d.py
@@ -149,13 +149,14 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor,
 @pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
 @pytest.mark.parametrize("silu_activation", [True])
 @pytest.mark.parametrize("has_bias", [True])
+@pytest.mark.parametrize("has_initial_state", [True, False])
 @pytest.mark.parametrize("width", [4])
 @pytest.mark.parametrize(
     'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096])
 @pytest.mark.parametrize('dim', [64])
 @pytest.mark.parametrize('batch', [1])
 def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
-                       itype):
+                       has_initial_state, itype):
     device = "cuda"
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
     if itype == torch.bfloat16:
@@ -167,11 +168,18 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
 
     weight = torch.randn(dim, width, device=device, dtype=itype)
     bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
-    initial_states = torch.randn(batch,
-                                 dim,
-                                 width - 1,
-                                 device=device,
-                                 dtype=itype)
+    if has_initial_state:
+        initial_states = torch.randn(batch,
+                                     dim,
+                                     width - 1,
+                                     device=device,
+                                     dtype=itype)
+        has_initial_state_tensor = torch.ones(batch,
+                                              dtype=torch.bool,
+                                              device=x.device)
+    else:
+        initial_states = None
+        has_initial_state_tensor = None
     x_ref = x.clone()
     weight_ref = weight.clone()
     bias_ref = bias.clone() if bias is not None else None
@@ -183,9 +191,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
                            bias,
                            activation=activation,
                            conv_states=initial_states,
-                           has_initial_state=torch.ones(batch,
-                                                        dtype=torch.bool,
-                                                        device=x.device))
+                           has_initial_state=has_initial_state_tensor)
     out_ref, final_states_ref = causal_conv1d_ref(
         x_ref,
         weight_ref,
@@ -193,11 +199,12 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
         initial_states=initial_states_ref,
         return_final_states=True,
         activation=activation)
-    assert initial_states is not None and final_states_ref is not None
-    assert torch.allclose(initial_states,
-                          final_states_ref,
-                          rtol=rtol,
-                          atol=atol)
+    if has_initial_state:
+        assert initial_states is not None and final_states_ref is not None
+        assert torch.allclose(initial_states,
+                              final_states_ref,
+                              rtol=rtol,
+                              atol=atol)
     assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
 
     causal_conv1d_opcheck_fn(x,
@@ -205,9 +212,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
                              bias,
                              activation=activation,
                              conv_states=initial_states,
-                             has_initial_state=torch.ones(batch,
-                                                          dtype=torch.bool,
-                                                          device=x.device))
+                             has_initial_state=has_initial_state_tensor)
 
 
 @pytest.mark.parametrize("itype", [torch.bfloat16])
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 15e576cb065c7..fb8c0b2a7ba26 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -28,7 +28,7 @@
 # yapf: enable
 from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights,
                               PackedLoRALayerWeights)
-from vllm.lora.punica import PunicaWrapper
+from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
@@ -48,11 +48,12 @@
     torch.float32: (5e-3, 5e-3),
     torch.bfloat16: (3e-2, 2e-2),
 }
-CUDA_DEVICES = [
+# TODO: Modify this based on platform
+DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
 
-# We will launch different triton kernels between the prefill and decode
+#For GPU, we will launch different triton kernels between the prefill and decode
 # stages, so we need to verify this. prefill stage(True) or decode stage(False)
 STAGES = [True, False]
 
@@ -192,9 +193,18 @@ def create_random_inputs(
     return inputs, index_mapping, prompt_mapping
 
 
+def check_punica_wrapper(punica_wrapper) -> bool:
+    if current_platform.is_cuda_alike():
+        from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
+
+        return type(punica_wrapper) is PunicaWrapperGPU
+    else:
+        return False
+
+
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
 def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@@ -205,7 +215,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
 
     torch.set_default_device(device)
     max_loras = 8
-    punica_wrapper = PunicaWrapper(8192, 256, device)
+    punica_wrapper = get_punica_wrapper(8192, 256, device)
+    assert check_punica_wrapper(punica_wrapper)
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
                              lora_dtype=torch.float16)
@@ -296,7 +307,7 @@ def create_random_embedding_layer():
 # @pytest.mark.skip(
 #     reason="Fails when loras are in any slot other than the first.")
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
 def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
@@ -305,7 +316,8 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
     torch.cuda.set_device(device)
     torch.set_default_device(device)
     max_loras = 8
-    punica_wrapper = PunicaWrapper(8192, 256, device)
+    punica_wrapper = get_punica_wrapper(8192, 256, device)
+    assert check_punica_wrapper(punica_wrapper)
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
                              lora_dtype=torch.float16)
@@ -432,7 +444,7 @@ def create_random_embedding_layer():
 
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
 @pytest.mark.parametrize("stage", STAGES)
 def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
@@ -441,7 +453,8 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
     torch.cuda.set_device(device)
     torch.set_default_device(device)
     max_loras = 8
-    punica_wrapper = PunicaWrapper(8192, 256, device)
+    punica_wrapper = get_punica_wrapper(8192, 256, device)
+    assert check_punica_wrapper(punica_wrapper)
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
                              lora_dtype=torch.float16)
@@ -563,17 +576,21 @@ def _pretest():
 
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
-def test_linear_replicated(dist_init, num_loras, device, stage) -> None:
+@pytest.mark.parametrize("bias_enabled", [True, False])
+def test_linear_replicated(dist_init, num_loras, device, stage,
+                           bias_enabled) -> None:
 
     torch.cuda.set_device(device)
     torch.set_default_device(device)
-    punica_wrapper = PunicaWrapper(8192, 256, device)
+    punica_wrapper = get_punica_wrapper(8192, 256, device)
+    assert check_punica_wrapper(punica_wrapper)
     max_loras = 8
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
-                             lora_dtype=torch.float16)
+                             lora_dtype=torch.float16,
+                             bias_enabled=bias_enabled)
 
     def create_random_linear_replicated_layer():
 
@@ -585,7 +602,12 @@ def create_random_linear_replicated_layer():
         lora_linear = ReplicatedLinearWithLoRA(linear)
 
         lora_linear.create_lora_weights(max_loras, lora_config)
-
+        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
+            lora_linear.lora_b_stacked) == 1)
+        if bias_enabled:
+            assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
+        else:
+            assert lora_linear.lora_bias_stacked is None
         return linear, lora_linear
 
     for i in range(10):
@@ -667,19 +689,22 @@ def create_random_linear_replicated_layer():
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("orientation", ["row", "column"])
 @pytest.mark.parametrize("fully_shard", [True, False])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
+@pytest.mark.parametrize("bias_enabled", [True, False])
 def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
-                         device, stage) -> None:
+                         device, stage, bias_enabled) -> None:
 
     torch.cuda.set_device(device)
     torch.set_default_device(device)
-    punica_wrapper = PunicaWrapper(8192, 256, device)
+    punica_wrapper = get_punica_wrapper(8192, 256, device)
+    assert check_punica_wrapper(punica_wrapper)
     max_loras = 8
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
                              fully_sharded_loras=fully_shard,
-                             lora_dtype=torch.float16)
+                             lora_dtype=torch.float16,
+                             bias_enabled=bias_enabled)
 
     def create_random_linear_parallel_layer():
         if orientation == "row":
@@ -700,7 +725,12 @@ def create_random_linear_parallel_layer():
                            if not fully_shard else
                            ColumnParallelLinearWithShardedLoRA(linear))
         lora_linear.create_lora_weights(max_loras, lora_config)
-
+        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
+            lora_linear.lora_b_stacked) == 1)
+        if bias_enabled:
+            assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
+        else:
+            assert lora_linear.lora_bias_stacked is None
         return linear, lora_linear
 
     for i in range(10):
@@ -782,19 +812,22 @@ def create_random_linear_parallel_layer():
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("repeats", [1, 2, 3])
 @pytest.mark.parametrize("fully_shard", [True, False])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
+@pytest.mark.parametrize("bias_enabled", [True, False])
 def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
-                                device, stage) -> None:
+                                device, stage, bias_enabled) -> None:
 
     torch.cuda.set_device(device)
     torch.set_default_device(device)
-    punica_wrapper = PunicaWrapper(8192, 256, device)
+    punica_wrapper = get_punica_wrapper(8192, 256, device)
+    assert check_punica_wrapper(punica_wrapper)
     max_loras = 8
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
                              fully_sharded_loras=fully_shard,
-                             lora_dtype=torch.float16)
+                             lora_dtype=torch.float16,
+                             bias_enabled=bias_enabled)
 
     def create_column_parallel_packed_layer():
         if repeats == 2:
@@ -832,10 +865,16 @@ class FakeConfig:
             num_key_value_heads = 32
             num_attention_heads = 32
 
+        n_slices = repeats
         lora_linear.create_lora_weights(max_loras,
                                         lora_config,
                                         model_config=FakeConfig())
-
+        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
+            lora_linear.lora_b_stacked) == n_slices)
+        if bias_enabled:
+            assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
+        else:
+            assert lora_linear.lora_bias_stacked is None
         return linear, lora_linear
 
     for i in range(10):
@@ -911,7 +950,6 @@ class FakeConfig:
             512,
             lora_config.lora_extra_vocab_size,
         )
-        # lora_linear.set_mapping(*mapping_info)
 
         lora_result = lora_linear(torch.cat(inputs))[0]
         expected_result = linear(torch.cat(inputs))[0]
@@ -941,7 +979,8 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
     seed = 0
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
-    punica_wrapper = PunicaWrapper(8192, 256, device)
+    punica_wrapper = get_punica_wrapper(8192, 256, device)
+    assert check_punica_wrapper(punica_wrapper)
     max_loras = 8
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index aae6310a2a213..d3ca7f878191a 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -55,15 +55,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     return generated_texts
 
 
-@fork_new_process_for_each_test
-def test_llama_lora(sql_lora_files):
-
-    llm = vllm.LLM(MODEL_PATH,
-                   enable_lora=True,
-                   max_num_seqs=16,
-                   max_loras=4,
-                   tensor_parallel_size=1)
-
+def generate_and_test(llm, sql_lora_files):
     print("lora adapter created")
     assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
 
@@ -79,6 +71,17 @@ def test_llama_lora(sql_lora_files):
     print("removing lora")
 
 
+@fork_new_process_for_each_test
+def test_llama_lora(sql_lora_files):
+
+    llm = vllm.LLM(MODEL_PATH,
+                   enable_lora=True,
+                   max_num_seqs=16,
+                   max_loras=4,
+                   tensor_parallel_size=1)
+    generate_and_test(llm, sql_lora_files)
+
+
 @fork_new_process_for_each_test
 def test_llama_lora_warmup(sql_lora_files):
     """Test that the LLM initialization works with a warmup LORA path and
@@ -118,20 +121,7 @@ def test_llama_lora_tp4(sql_lora_files):
         max_loras=4,
         tensor_parallel_size=4,
     )
-
-    print("lora adapter created")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
-
-    print("lora 1")
-    assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT
-
-    print("no lora")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
-
-    print("lora 2")
-    assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT
-
-    print("removing lora")
+    generate_and_test(llm, sql_lora_files)
 
 
 @multi_gpu_test(num_gpus=4)
@@ -146,16 +136,20 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
         tensor_parallel_size=4,
         fully_sharded_loras=True,
     )
-    print("lora adapter created")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
-
-    print("lora 1")
-    assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT
+    generate_and_test(llm, sql_lora_files)
 
-    print("no lora")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
 
-    print("lora 2")
-    assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT
+@multi_gpu_test(num_gpus=4)
+@fork_new_process_for_each_test
+def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
 
-    print("removing lora")
+    llm = vllm.LLM(
+        MODEL_PATH,
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=4,
+        tensor_parallel_size=4,
+        fully_sharded_loras=True,
+        enable_lora_bias=True,
+    )
+    generate_and_test(llm, sql_lora_files)
diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py
index daa39b2a3dba1..d225a3f7d6c06 100644
--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
@@ -17,6 +17,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
         tokenizer_id="gpt2",
         enable_lora=True,
         max_num_seqs=1,
+        max_loras=1,
         max_input_length=None,
     )
     lora_request = LoRARequest("1", 1, sql_lora_files)
@@ -53,3 +54,22 @@ def test_get_lora_tokenizer(sql_lora_files, tmp_path):
     lora_request = LoRARequest("1", 1, str(tmp_path))
     tokenizer = get_lora_tokenizer(lora_request)
     assert not tokenizer
+
+
+@pytest.mark.parametrize("enable_lora", [True, False])
+@pytest.mark.parametrize("max_num_seqs", [1, 2])
+@pytest.mark.parametrize("max_loras", [1, 2])
+def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras):
+    tokenizer_group = get_tokenizer_group(
+        get_tokenizer_pool_config(None),
+        tokenizer_id="gpt2",
+        enable_lora=enable_lora,
+        max_num_seqs=max_num_seqs,
+        max_loras=max_loras,
+        max_input_length=None,
+    )
+    if enable_lora:
+        assert tokenizer_group.lora_tokenizers.capacity == max(
+            max_num_seqs, max_loras)
+    else:
+        assert tokenizer_group.lora_tokenizers.capacity == 0
diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py
index 45fab8e96b968..9f4d81b583141 100644
--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
@@ -36,7 +36,8 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("backend",
+                         ["outlines", "lm-format-enforcer", "xgrammar"])
 async def test_guided_logits_processor_black_box(backend: str, sample_regex,
                                                  sample_json_schema):
     tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py
index 87a05b3011393..cae25ae9fa2c8 100644
--- a/tests/models/decoder_only/language/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
@@ -1,8 +1,8 @@
 import pytest
 
 from tests.utils import multi_gpu_test
+from vllm.config import VllmConfig
 from vllm.sampling_params import SamplingParams
-from vllm.worker.model_runner import _get_graph_batch_size
 
 from ...utils import check_outputs_equal
 
@@ -189,7 +189,8 @@ def test_mamba_cache_cg_padding(
     # This test is for verifying that mamba cache is padded to CG captured
     # batch size. If it's not, a torch RuntimeError will be raised because
     # tensor dimensions aren't compatible
-    while len(example_prompts) == _get_graph_batch_size(len(example_prompts)):
+    while len(example_prompts) == VllmConfig.get_graph_batch_size(
+            len(example_prompts)):
         example_prompts.append(example_prompts[0])
 
     try:
diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py
index 01e208347bff4..35018c3c14dee 100644
--- a/tests/models/decoder_only/language/test_mamba.py
+++ b/tests/models/decoder_only/language/test_mamba.py
@@ -5,8 +5,8 @@
 import pytest
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+from vllm.config import VllmConfig
 from vllm.sampling_params import SamplingParams
-from vllm.worker.model_runner import _get_graph_batch_size
 
 from ...utils import check_outputs_equal
 
@@ -200,7 +200,8 @@ def test_mamba_cache_cg_padding(
     # This test is for verifying that mamba cache is padded to CG captured
     # batch size. If it's not, a torch RuntimeError will be raised because
     # tensor dimensions aren't compatible
-    while len(example_prompts) == _get_graph_batch_size(len(example_prompts)):
+    while len(example_prompts) == VllmConfig.get_graph_batch_size(
+            len(example_prompts)):
         example_prompts.append(example_prompts[0])
 
     try:
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
index 60a8f63eb5faa..c16192a1e1438 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
@@ -2,12 +2,10 @@
 from typing import Optional
 
 import pytest
-import torch
-from transformers import AutoImageProcessor, AutoTokenizer
+from transformers import AutoTokenizer
 
-from vllm.inputs import InputContext, token_inputs
+from vllm.inputs import InputContext, InputProcessingContext
 from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
-from vllm.multimodal import MultiModalRegistry
 
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
@@ -17,15 +15,9 @@
 
 # Wrap lazy imports to avoid initializing CUDA during test collection
 @pytest.fixture()
-def input_processor_for_phi3v():
-    from vllm.model_executor.models.phi3v import input_processor_for_phi3v
-    return input_processor_for_phi3v
-
-
-@pytest.fixture()
-def dummy_data_for_phi3v():
-    from vllm.model_executor.models.phi3v import dummy_data_for_phi3v
-    return dummy_data_for_phi3v
+def processor_for_phi3v():
+    from vllm.model_executor.models.phi3v import Phi3VProcessor
+    return Phi3VProcessor
 
 
 @pytest.fixture()
@@ -34,53 +26,6 @@ def get_max_phi3v_image_tokens():
     return get_max_phi3v_image_tokens
 
 
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_crops", [4, 16, None])
-def test_input_mapper_override(model: str, image_assets: _ImageAssets,
-                               num_crops: Optional[int]):
-    """Ensure that the [default] input mapper handles num_crops properly."""
-    # We pass the processor kwargs here since for this model, we fall back to
-    # the default mapper; this will fall back to the HF mapper and forward
-    # mm_processor_kwargs to it.
-    mm_processor_kwargs = {
-        "num_crops": num_crops
-    } if num_crops is not None else {}
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=mm_processor_kwargs,
-    )
-
-    hf_processor = AutoImageProcessor.from_pretrained(model,
-                                                      trust_remote_code=True,
-                                                      **mm_processor_kwargs)
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    image = image_assets[0].pil_image
-    hf_result = hf_processor.preprocess(
-        image,
-        return_tensors="pt",
-    )
-
-    vllm_result = mm_registry.map_input(
-        ctx.model_config,
-        {"image": image},
-    )
-
-    assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"])
-    assert torch.all(
-        hf_result["num_img_tokens"] == vllm_result["num_img_tokens"])
-
-    # For pixel values, the second axis should be the num_crops + 1
-    # for the rescaled original image. The default value in VLLM falls
-    # back to the HF config, which is why we compare to the processor num_crops
-    assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
-    assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1
-
-
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("num_crops,expected_max_tokens", [
     (4, 781),
@@ -112,48 +57,20 @@ def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
-    (4, 781, 1),
-    (4, 781, 2),
-    (16, 2653, 1),
-    (16, 2653, 2),
-])
-def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
-                             toks_per_img: int, num_imgs: int):
-    """Ensure dummy_data_for_phi3v handles num_crops properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the dummy data func.
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-
-    dummy_data = dummy_data_for_phi3v(
-        ctx=ctx,
-        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
-        mm_counts={"image": num_imgs},
-        num_crops=num_crops,
-    )
-    sequence_data = dummy_data.seq_data
-    # Ensure we have the right number of placeholders per num_crops size
-    img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
-    assert img_tok_count == toks_per_img * num_imgs
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
-    (4, 757, 1),
-    (4, 757, 2),
-    (16, 1921, 1),
-    (16, 1921, 2),
-])
-def test_input_processor_override(input_processor_for_phi3v,
-                                  image_assets: _ImageAssets, model: str,
-                                  num_crops: int, expected_toks_per_img: int,
-                                  num_imgs: int):
+@pytest.mark.parametrize(
+    "num_crops,expected_toks_per_img,num_imgs",
+    [
+        (4, 757, 1),
+        (4, 757, 2),
+        (16, 1921, 1),
+        (16, 1921, 2),
+        # the default num_crops of phi-3.5-vision is 4
+        (None, 757, 2),
+        (None, 757, 2),
+    ])
+def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets,
+                            model: str, num_crops: Optional[int],
+                            expected_toks_per_img: int, num_imgs: int):
     """Ensure input_processor_for_phi3v handles num_crops properly."""
     # Same as the previous test - don't initialize mm_processor_kwargs
     # in this test and assume that the kwargs will be correctly expanded by
@@ -163,19 +80,20 @@ def test_input_processor_override(input_processor_for_phi3v,
         tokenizer_name=model,
         trust_remote_code=True,
     )
-    tokenizer = AutoTokenizer.from_pretrained(model)
+    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
     # Build the image str / prompt based on the number of images we pass
     img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
     prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
     images = [image_assets[0].pil_image] * num_imgs
 
-    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
-                          prompt=prompt,
-                          multi_modal_data={"image": images})
+    mm_data = {"image": images}
+    mm_processor_kwargs = {}
+    if num_crops is not None:
+        mm_processor_kwargs = {"num_crops": num_crops}
 
-    processed_inputs = input_processor_for_phi3v(ctx,
-                                                 inputs,
-                                                 num_crops=num_crops)
+    processor = processor_for_phi3v(ctx)
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
     img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index dbb0b4d350d10..ed8f34a677f84 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -6,7 +6,6 @@
 from typing import Type
 
 import pytest
-import transformers
 from transformers import AutoModelForVision2Seq
 from transformers.utils import is_flash_attn_2_available
 
@@ -35,7 +34,7 @@
     "dtype": "half",
     "max_tokens": 5,
     "tensor_parallel_size": 2,
-    "model_kwargs": {"device_map": "auto"},
+    "hf_model_kwargs": {"device_map": "auto"},
     "image_size_factors": [(.25, 0.5, 1.0)],
     "distributed_executor_backend": (
         "ray",
@@ -109,7 +108,7 @@
             "cherry_blossom": "What is in the picture?",
         }),
         auto_cls=AutoModelForVision2Seq,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
@@ -152,7 +151,7 @@
             "cherry_blossom": "<vlm_image>Please infer the season with reason.",
         }),
         multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
-        postprocess_inputs=model_utils.get_key_type_post_processor("pixel_values"),
+        postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"),
         stop_str=["<|im_end|>"],
         image_size_factors=[(0.10, 0.15)],
         max_tokens=64,
@@ -178,7 +177,7 @@
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         auto_cls=AutoModelForVision2Seq,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
         # For chameleon, we only compare the sequences
@@ -187,12 +186,6 @@
         comparator=check_outputs_equal,
         max_tokens=8,
         dtype="bfloat16",
-        marks=[
-            pytest.mark.skipif(
-                transformers.__version__ < "4.46.2",
-                reason="Model broken in HF, see huggingface/transformers#34379"
-            ),
-        ]
     ),
     "fuyu": VLMTestInfo(
         models=["adept/fuyu-8b"],
@@ -243,13 +236,7 @@
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
-        marks=[
-            pytest.mark.skipif(
-                transformers.__version__ < "4.46.0",
-                reason="Model introduced in HF >= 4.46.0"
-            ),
-            large_gpu_mark(min_gb=48),
-        ],
+        marks=[large_gpu_mark(min_gb=48)],
     ),
     "intern_vl": VLMTestInfo(
         models=[
@@ -294,7 +281,7 @@
         prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
         num_video_frames=16,
         max_model_len=16384,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values_videos"
         ),
         auto_cls=AutoModelForVision2Seq,
@@ -318,12 +305,20 @@
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
         image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
-        marks=[
-            pytest.mark.skipif(
-                transformers.__version__ < "4.46.2",
-                reason="Model broken with changes in transformers 4.46"
-            )
-        ],
+    ),
+    "mantis": VLMTestInfo(
+        models=["TIGER-Lab/Mantis-8B-siglip-llama3"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
+        max_model_len=4096,
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
+            "pixel_values"
+        ),
+        vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}},  # noqa: E501
+        get_stop_token_ids=lambda tok: [128009],
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
+        patch_hf_runner=model_utils.mantis_patch_hf_runner,
     ),
     "minicpmv_25": VLMTestInfo(
         models=["openbmb/MiniCPM-Llama3-V-2_5"],
@@ -361,7 +356,7 @@
     #     max_num_seqs=2,
     #     task="generate",
     #     # use eager mode for hf runner since phi3v didn't work with flash_attn
-    #     model_kwargs={"_attn_implementation": "eager"},
+    #     hf_model_kwargs={"_attn_implementation": "eager"},
     #     use_tokenizer_eos=True,
     #     vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
     #     num_logprobs=10,
@@ -392,7 +387,7 @@
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         auto_cls=AutoModelForVision2Seq,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
         vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
@@ -404,10 +399,6 @@
                 cuda_device_count_stateless() < 2,
                 reason="Need at least 2 GPUs to run the test.",
             ),
-            pytest.mark.skipif(
-                transformers.__version__ < "4.46.2",
-                reason="Model broken in HF, see huggingface/transformers#34379"
-            )
         ],
         **COMMON_BROADCAST_SETTINGS # type: ignore
     ),
@@ -461,7 +452,7 @@
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=16384,
         max_num_seqs=2,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
         auto_cls=AutoModelForVision2Seq,
diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py
index 6233860747b9c..90c0fab99054c 100644
--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -228,7 +228,7 @@ def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
                          name_1="output")
 
 
-@large_gpu_test(min_gb=24)
+@large_gpu_test(min_gb=48)
 @pytest.mark.parametrize(
     "prompt,expected_ranges",
     [(_create_engine_inputs_hf(IMG_URLS[:1]), [{
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py
index 88349ef9a3a69..54b7b0733210f 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
@@ -3,9 +3,11 @@
 
 import torch
 from PIL.Image import Image
-from transformers import AutoTokenizer, BatchEncoding
+from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
+from vllm.config import TaskOption
+
 from .....conftest import HfRunner, VllmRunner
 from .types import RunnerOutput
 
@@ -28,13 +30,15 @@ def run_test(
     use_tokenizer_eos: bool,
     postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
     comparator: Callable[..., None],
-    get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]],
+    get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase],
+                                          List[int]]],
     stop_str: Optional[List[str]],
     tokenizer_mode: str,
     limit_mm_per_prompt: Dict[str, int],
-    model_kwargs: Optional[Dict[str, Any]],
+    vllm_runner_kwargs: Optional[Dict[str, Any]],
+    hf_model_kwargs: Optional[Dict[str, Any]],
     patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
-    task: str = "auto",
+    task: TaskOption = "auto",
     runner_mm_key: str = "images",
     distributed_executor_backend: Optional[str] = None,
     tensor_parallel_size: int = 1,
@@ -58,6 +62,9 @@ def run_test(
     if stop_str:
         vllm_kwargs["stop"] = stop_str
 
+    if vllm_runner_kwargs is None:
+        vllm_runner_kwargs = {}
+
     with vllm_runner(model,
                      tokenizer_mode=tokenizer_mode,
                      max_model_len=max_model_len,
@@ -67,7 +74,8 @@ def run_test(
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=enforce_eager,
-                     task=task) as vllm_model:
+                     task=task,
+                     **vllm_runner_kwargs) as vllm_model:
         for prompts, media in vllm_inputs:
             vllm_kwargs[runner_mm_key] = media
             vllm_output = vllm_model.generate_greedy_logprobs(
@@ -78,7 +86,7 @@ def run_test(
                          dtype=dtype,
                          auto_cls=auto_cls,
                          postprocess_inputs=postprocess_inputs,
-                         model_kwargs=model_kwargs)
+                         model_kwargs=hf_model_kwargs)
 
     # Some models need to patch things like the model processor, e.g., internvl
     if patch_hf_runner is not None:
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 15f15dd7d8030..3eca8fb9dcb1a 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -126,6 +126,16 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
     return hf_output_ids, hf_output_str, out_logprobs
 
 
+def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
+                             model: str) -> RunnerOutput:
+    """Sanitize vllm output [mantis] to compare with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    hf_output_str = output_str + "<|eot_id|>"
+
+    return output_ids, hf_output_str, out_logprobs
+
+
 def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
                             model: str) -> RunnerOutput:
     """Sanitize vllm output [phi3v] to be comparable with hf output."""
@@ -184,7 +194,7 @@ def get_llava_embeddings(image_assets: _ImageAssets):
 
 
 ####### postprocessors to run on HF BatchEncoding
-def get_key_type_post_processor(
+def cast_dtype_post_processor(
         hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
     """Gets a handle to a post processor which converts a given key into a
     target data type."""
@@ -418,3 +428,26 @@ def _internvl_generate(
     )
 
     return outputs
+
+
+def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    from mantis.models.mllava import MLlavaProcessor
+
+    hf_model.processor = MLlavaProcessor.from_pretrained(hf_model.model_name)
+
+    orig_generate = hf_model.model.generate
+    tokenizer = hf_model.processor.tokenizer
+
+    def _generate(self, *args, **kwargs):
+        return orig_generate(
+            *args,
+            **kwargs,
+            eos_token_id=[
+                tokenizer.eos_token_id,
+                tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+            ],
+        )
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+
+    return hf_model
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py
index d410fa8c653ce..e2e0c6390fcb9 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
@@ -7,9 +7,11 @@
 import torch
 from PIL.Image import Image
 from pytest import MarkDecorator
-from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding
+from transformers import (AutoModelForCausalLM, BatchEncoding,
+                          PreTrainedTokenizerBase)
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
+from vllm.config import TaskOption
 from vllm.sequence import SampleLogprobs
 from vllm.utils import identity
 
@@ -66,7 +68,7 @@ class ImageSizeWrapper(NamedTuple):
 class VLMTestInfo(NamedTuple):
     """Holds the configuration for 1+ tests for one model architecture."""
 
-    models: Union[List[str]]
+    models: List[str]
     test_type: Union[VLMTestType, Iterable[VLMTestType]]
 
     # Should be None only if this is a CUSTOM_INPUTS test
@@ -92,18 +94,20 @@ class VLMTestInfo(NamedTuple):
     enforce_eager: bool = True
     max_model_len: int = 1024
     max_num_seqs: int = 256
-    task: str = "auto"
+    task: TaskOption = "auto"
     tensor_parallel_size: int = 1
+    vllm_runner_kwargs: Optional[Dict[str, Any]] = None
 
     # Optional callable which gets a list of token IDs from the model tokenizer
-    get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None
+    get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase],
+                                          List[int]]] = None
     # Optional list of strings to stop generation, useful when stop tokens are
     # not special tokens in the tokenizer
     stop_str: Optional[List[str]] = None
 
     # Exposed options for HF runner
-    model_kwargs: Optional[Dict[str, Any]] = None
-    # Indicates we should explicitly pass the EOS from the tokeniezr
+    hf_model_kwargs: Optional[Dict[str, Any]] = None
+    # Indicates we should explicitly pass the EOS from the tokenizer
     use_tokenizer_eos: bool = False
     auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM
     # Callable to pass to the HF runner to run on inputs; for now, we also pass
@@ -164,6 +168,7 @@ def get_non_parametrized_runner_kwargs(self):
             "max_num_seqs": self.max_num_seqs,
             "task": self.task,
             "tensor_parallel_size": self.tensor_parallel_size,
+            "vllm_runner_kwargs": self.vllm_runner_kwargs,
             "hf_output_post_proc": self.hf_output_post_proc,
             "vllm_output_post_proc": self.vllm_output_post_proc,
             "auto_cls": self.auto_cls,
@@ -171,8 +176,8 @@ def get_non_parametrized_runner_kwargs(self):
             "postprocess_inputs": self.postprocess_inputs,
             "comparator": self.comparator,
             "get_stop_token_ids": self.get_stop_token_ids,
+            "hf_model_kwargs": self.hf_model_kwargs,
             "stop_str": self.stop_str,
-            "model_kwargs": self.model_kwargs,
             "patch_hf_runner": self.patch_hf_runner,
             "tokenizer_mode": self.tokenizer_mode
         }
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 461f453d8b1c3..a89518820045f 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -176,6 +176,7 @@ class _HfExamplesInfo:
     "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"),  # noqa: E501
     "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"),  # noqa: E501
     "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
+    "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3"),  # noqa: E501
     "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
                                 trust_remote_code=True),
     "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 2a072737db043..3b728f2744fca 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -1,7 +1,6 @@
 from unittest.mock import patch
 
 import pytest
-import transformers
 from transformers import PretrainedConfig
 
 from vllm import LLM
@@ -11,10 +10,6 @@
 
 @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
 def test_can_initialize(model_arch):
-    if (model_arch in {"Idefics3ForConditionalGeneration", "GlmForCausalLM"}
-            and transformers.__version__ < "4.46.0"):
-        pytest.skip(reason="Model introduced in HF >= 4.46.0")
-
     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
     if not model_info.is_available_online:
         pytest.skip("Model is not available online")
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 13ad4a7966b9d..71832acbd17b8 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import pytest
-from transformers import CLIPImageProcessor, LlavaNextImageProcessor
+from transformers import LlavaNextImageProcessor
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MultiModalRegistry
@@ -14,49 +14,6 @@ def mm_registry():
     return MultiModalRegistry()
 
 
-@pytest.mark.parametrize("dtype", ["half", "float"])
-@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
-def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
-    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-
-    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
-    assert isinstance(hf_processor, CLIPImageProcessor)
-
-    model_config = ModelConfig(
-        model=MODEL_NAME,
-        task="auto",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype=dtype,
-        revision=None,
-        limit_mm_per_prompt={"image": 1},
-    )
-
-    mm_registry.init_mm_limits_per_prompt(model_config)
-
-    for asset in image_assets:
-        image = rescale_image_size(asset.pil_image, size_factor)
-
-        hf_result = hf_processor.preprocess(
-            image,
-            return_tensors="pt",
-        )
-        vllm_result = mm_registry.map_input(
-            model_config,
-            {"image": image},
-        )
-
-        assert hf_result.keys() == vllm_result.keys()
-        for key, hf_tensor in hf_result.items():
-            hf_arr: np.ndarray = hf_tensor.numpy()
-            vllm_arr: np.ndarray = vllm_result[key].numpy()
-
-            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
-            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
-
-
 @pytest.mark.parametrize("dtype", ["half", "float"])
 @pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
 def test_llava_next_image_processor(image_assets, mm_registry, dtype,
@@ -107,7 +64,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
      (2, 1, False), (2, 2, True)],
 )
 def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
-    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf"
 
     model_config = ModelConfig(
         model=MODEL_NAME,
@@ -138,7 +95,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
 # NOTE: We don't test zero images since the HF processor doesn't support it
 @pytest.mark.parametrize("num_images", [1, 2])
 def test_image_mapper_multi(image_assets, mm_registry, num_images):
-    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf"
 
     model_config = ModelConfig(
         model=MODEL_NAME,
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index b2367060c6c1b..ae668d1dd56c8 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -3,50 +3,15 @@
 import pytest
 from transformers import BatchFeature
 
-from vllm.multimodal.processing import (PromptReplacement, find_text_matches,
-                                        find_token_matches, iter_token_matches,
-                                        iter_token_runs, replace_text_matches)
+from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo,
+                                        find_text_matches, find_token_matches,
+                                        iter_placeholders, iter_token_matches,
+                                        replace_text_matches,
+                                        replace_token_matches)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import full_groupby
 
 
-# yapf: disable
-@pytest.mark.parametrize(
-    ("token_ids", "expected"),
-    [
-        ([], []),
-        (
-            [32000, 32000, 32000],
-            [{ "token_id": 32000, "start_idx": 0, "length": 3 }],
-        ),
-        (
-            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
-            [
-                { "token_id": 9833, "start_idx": 0, "length": 1 },
-                { "token_id": 28747, "start_idx": 1, "length": 1 },
-                { "token_id": 32000, "start_idx": 2, "length": 3 },
-                { "token_id": 9833, "start_idx": 5, "length": 1 },
-                { "token_id": 28747, "start_idx": 6, "length": 1 },
-                { "token_id": 32000, "start_idx": 7, "length": 2 },
-                { "token_id": 918, "start_idx": 9, "length": 1 },
-            ],
-        ),
-    ],
-)
-# yapf: enable
-def test_iter_token_runs(token_ids, expected):
-    result = list(iter_token_runs(token_ids))
-
-    # Only displayed on error
-    print("result:", result)
-
-    # Manually constructed results
-    assert [item._asdict() for item in result] == expected
-
-    # Invariants
-    assert sum(run_info.length for run_info in result) == len(token_ids)
-
-
 # yapf: disable
 @pytest.mark.parametrize(
     ("token_ids", "match_ids", "expected"),
@@ -170,13 +135,11 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
     # Should not be used since there is nothing to convert to token IDs
     mock_tokenizer = cast(AnyTokenizer, object())
 
-    result = find_token_matches(
-        prompt,
-        [
-            PromptReplacement(target, [], 0).bind(key, mock_tokenizer)
-            for key, target in target_by_key.items()
-        ],
-    )
+    prompt_repls = [
+        PromptReplacement(target, [], 0).bind(key, mock_tokenizer)
+        for key, target in target_by_key.items()
+    ]
+    result = find_token_matches(prompt, prompt_repls)
 
     # Only displayed on error
     print("result:", result)
@@ -279,13 +242,11 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key):
     # Should not be used since there is nothing to convert to text
     mock_tokenizer = cast(AnyTokenizer, object())
 
-    result = find_text_matches(
-        prompt,
-        [
-            PromptReplacement(target, [], 0).bind(key, mock_tokenizer)
-            for key, target in target_by_key.items()
-        ],
-    )
+    prompt_repls = [
+        PromptReplacement(target, [], 0).bind(key, mock_tokenizer)
+        for key, target in target_by_key.items()
+    ]
+    result = find_text_matches(prompt, prompt_repls)
 
     # Only displayed on error
     print("result:", result)
@@ -303,7 +264,7 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key):
 
 # yapf: disable
 @pytest.mark.parametrize(
-    ("prompt", "target_by_key", "repl_by_key", "expected_by_mm_count"),
+    ("prompt", "target_by_key", "repl_by_key"),
     [
         (
             "Image:<image>Image:<image><image>!",
@@ -322,49 +283,201 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key):
                 # Test multiple repl_count
                 "pattern_3": ("?", 2),
             },
-            {
-                # Test no replacement
-                0: "Image:<image>Image:<image><image>!",
-                # Test single replacement
-                1: "<image><image>Image:<image><image>??",
-                # Test repeated replacement
-                2: "<image><image><image><image><image>??",
-            },
         ),
     ]
 )
+@pytest.mark.parametrize(
+    ("mm_count", "expected"),
+    [
+        (0, "Image:<image>Image:<image><image>!"),
+        (1, "<image><image>Image:<image><image>??"),
+        (2, "<image><image><image><image><image>??"),
+    ]
+)
 # yapf: enable
 def test_find_replace_text(
     prompt,
     target_by_key,
     repl_by_key,
-    expected_by_mm_count,
+    mm_count,
+    expected,
 ):
     # Should not be used since there is nothing to convert to text
     mock_tokenizer = cast(AnyTokenizer, object())
 
-    matches = find_text_matches(
+    prompt_repls = [
+        PromptReplacement(target, *repl_by_key[key]).bind(key, mock_tokenizer)
+        for key, target in target_by_key.items()
+    ]
+    matches = find_text_matches(prompt, prompt_repls)
+
+    result = replace_text_matches(
         prompt,
-        [
-            PromptReplacement(target, *repl_by_key[key]) \
-                .bind(key, mock_tokenizer)
-            for key, target in target_by_key.items()
-        ],
+        matches,
+        {key: list(range(mm_count))
+         for key in repl_by_key},
+        BatchFeature(),
     )
-    result_by_mm_count = {
-        mm_count: replace_text_matches(
-            prompt,
-            matches,
-            {key: list(range(mm_count))
-             for key in repl_by_key},
-            BatchFeature(),
-        )
-        for mm_count in expected_by_mm_count
-    }
 
     # Only displayed on error
     print("matches:", matches)
-    print("result_by_mm_count:", result_by_mm_count)
+    print("result:", result)
+
+    # Manually constructed results
+    assert result == expected
+
+
+# yapf: disable
+@pytest.mark.parametrize(
+    ("prompt", "target_by_key", "repl_by_key"),
+    [
+        # Tokenized test cases of `test_find_replace_text`
+        # using the vocab of llava-hf/llava-v1.6-mistral-7b-hf
+        (
+            [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
+            {
+                # We use `<image>` before `Image:` to test matches that
+                # occur out of order
+                "pattern_1": [32000],
+                "pattern_2": [9833, 28747],
+                "pattern_3": [918],
+            },
+            {
+                # Test whether target is confused with repl_unit
+                "pattern_1": ([32000, 32000], 1),
+                # Test empty repl_unit
+                "pattern_2": ([], 1),
+                # Test multiple repl_count
+                "pattern_3": ([1550], 2),
+            },
+        ),
+    ]
+)
+@pytest.mark.parametrize(
+    ("mm_count", "expected"),
+    [
+        (0, [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918]),
+        (1, [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 1550]),
+        (2, [1, 32000, 32000, 32000, 32000, 32000, 1550, 1550]),
+    ]
+)
+# yapf: enable
+def test_find_replace_tokens(
+    prompt,
+    target_by_key,
+    repl_by_key,
+    mm_count,
+    expected,
+):
+    # Should not be used since there is nothing to convert to tokens
+    mock_tokenizer = cast(AnyTokenizer, object())
+
+    prompt_repls = [
+        PromptReplacement(target, *repl_by_key[key]).bind(key, mock_tokenizer)
+        for key, target in target_by_key.items()
+    ]
+    matches = find_token_matches(prompt, prompt_repls)
+
+    result = replace_token_matches(
+        prompt,
+        matches,
+        {key: list(range(mm_count))
+         for key in repl_by_key},
+        BatchFeature(),
+    )
+
+    # Only displayed on error
+    print("matches:", matches)
+    print("result:", result)
+
+    # Manually constructed results
+    assert result == expected
+
+
+# yapf: disable
+@pytest.mark.parametrize(
+    "repl_by_key",
+    [
+        {
+            "pattern_1": ([32000, 32000], 1),
+            "pattern_2": ([], 1),
+            "pattern_3": ([1550], 2),
+        },
+    ],
+)
+@pytest.mark.parametrize(
+    ("prompt", "expected"),
+    [
+        (
+            [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
+            [
+                _PlaceholderInfo(
+                    modality="pattern_1",
+                    start_idx=6,
+                    unit=[32000, 32000],
+                    unit_count=1,
+                ),
+            ],
+        ),
+        (
+            [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 1550],
+            [
+                _PlaceholderInfo(
+                    modality="pattern_1",
+                    start_idx=1,
+                    unit=[32000, 32000],
+                    unit_count=1,
+                ),
+                _PlaceholderInfo(
+                    modality="pattern_1",
+                    start_idx=5,
+                    unit=[32000, 32000],
+                    unit_count=1,
+                ),
+                _PlaceholderInfo(
+                    modality="pattern_3",
+                    start_idx=7,
+                    unit=[1550],
+                    unit_count=2,
+                ),
+            ],
+        ),
+        (
+            [1, 32000, 32000, 32000, 32000, 32000, 1550, 1550],
+            [
+                _PlaceholderInfo(
+                    modality="pattern_1",
+                    start_idx=1,
+                    unit=[32000, 32000],
+                    unit_count=2,
+                ),
+                _PlaceholderInfo(
+                    modality="pattern_3",
+                    start_idx=6,
+                    unit=[1550],
+                    unit_count=2,
+                ),
+            ],
+        ),
+    ]
+)
+def test_iter_placeholders(
+    repl_by_key,
+    prompt,
+    expected,
+):
+    # Should not be used since there is nothing to convert to tokens
+    mock_tokenizer = cast(AnyTokenizer, object())
+
+    prompt_repls = [
+        PromptReplacement([], *repl).bind(key, mock_tokenizer)
+        for key, repl in repl_by_key.items()
+    ]
+
+    result = list(iter_placeholders(prompt_repls, prompt))
+
+    # Only displayed on error
+    print("result:", result)
 
     # Manually constructed results
-    assert result_by_mm_count == expected_by_mm_count
+    assert result == expected
diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py
index e6c8793989e13..d141cdf1f083b 100644
--- a/tests/multimodal/test_processor_kwargs.py
+++ b/tests/multimodal/test_processor_kwargs.py
@@ -15,13 +15,13 @@
 # Used for fast tests where the model doesn't matter
 DUMMY_MODEL_ID = "facebook/opt-125m"
 # Used for tests that need a multimodal model
-MULTIMODAL_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
+MULTIMODAL_MODEL_ID = "OpenGVLab/InternVL2-2B"
 
 # For mm_processor_kwargs - we test overrides by defining mocks for each place
 # it is used, and ensuring that we can pass processor kwargs an override value
 # to receive the intended result for things like sequence length etc.
-DEFAULT_NUM_CROPS = 4
-NUM_CROPS_OVERRIDE = 16
+DEFAULT_MAX_DYNAMIC_PATCH = 6
+MAX_DYNAMIC_PATCH_OVERRIDE = 4
 
 
 # Mocks for all of the places that we use the mm_processor_kwargs
@@ -33,10 +33,11 @@ def use_processor_mock():
     def custom_processor(ctx: InputContext,
                          inputs: DecoderOnlyInputs,
                          *,
-                         num_crops=DEFAULT_NUM_CROPS):
+                         max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH):
         # For testing purposes, we don't worry about the prompt
-        return token_inputs(prompt_token_ids=[],
-                            mm_processor_kwargs={"num_crops": num_crops})
+        return token_inputs(
+            prompt_token_ids=[],
+            mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch})
 
     with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor",
                return_value=custom_processor):
@@ -52,9 +53,9 @@ def custom_dummy_data_factory(self,
                                   seq_len: int,
                                   mm_counts: Mapping[str, int],
                                   *,
-                                  num_crops=DEFAULT_NUM_CROPS):
+                                  max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH):
         seq_data = SequenceData(
-            array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * num_crops))
+            array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * max_dynamic_patch))
         return DummyData(seq_data, None)
 
     with patch(
@@ -65,15 +66,15 @@ def custom_dummy_data_factory(self,
 
 # Lazy import to avoid CUDA reinitialization error
 def mm_model_cls():
-    from vllm.model_executor.models.phi3v import Phi3VForCausalLM
+    from vllm.model_executor.models.internvl import InternVLChatModel
 
-    return Phi3VForCausalLM
+    return InternVLChatModel
 
 
 # lambda whose signature matches max token calcs extra & mapper + extra kwargs
-get_num_crops = lambda ctx, *, num_crops=DEFAULT_NUM_CROPS: num_crops
-custom_mapper = lambda ctx, data, *, num_crops=DEFAULT_NUM_CROPS: {
-    "pixel_values": torch.zeros(size=(1, num_crops + 1, 3, 336, 336))
+get_max_dynamic_patch = lambda ctx, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: max_dynamic_patch  # noqa: E501
+custom_mapper = lambda ctx, data, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: {  # noqa: E501
+    "pixel_values": torch.zeros(size=(1, max_dynamic_patch + 1, 3, 448, 448))
 }
 
 
@@ -88,27 +89,28 @@ def test_default_processor_is_a_noop():
     assert proc_inputs is proc_outputs
 
 
-def _get_num_crops_info(init_num_crops: int, inference_num_crops: int):
-    """Get the init / inference kwargs and expected num_crops for this test."""
-    # If we have a value for num_crops, pass the override value and make
+def _get_max_dynamic_patch_info(init_max_dynamic_patch: int,
+                                inference_max_dynamic_patch: int):
+    """Get the init / inference kwargs and expected max_dynamic_patch."""
+    # If we have a value for max_dynamic_patch, pass the override value and make
     # sure we get that value as a return-value from out mock processor,
     # otherwise fall back to the default value
-    init_kwargs = None if init_num_crops is None else {
-        "num_crops": init_num_crops
+    init_kwargs = None if init_max_dynamic_patch is None else {
+        "max_dynamic_patch": init_max_dynamic_patch
     }
-    inference_kwargs = None if inference_num_crops is None else {
-        "num_crops": inference_num_crops
+    inference_kwargs = None if inference_max_dynamic_patch is None else {
+        "max_dynamic_patch": inference_max_dynamic_patch
     }
-    if inference_num_crops is not None:
-        expected_seq_count = inference_num_crops
-    elif init_num_crops is not None:
-        expected_seq_count = init_num_crops
+    if inference_max_dynamic_patch is not None:
+        expected_seq_count = inference_max_dynamic_patch
+    elif init_max_dynamic_patch is not None:
+        expected_seq_count = init_max_dynamic_patch
     else:
-        expected_seq_count = DEFAULT_NUM_CROPS
+        expected_seq_count = DEFAULT_MAX_DYNAMIC_PATCH
     return init_kwargs, inference_kwargs, expected_seq_count
 
 
-def _get_processed_num_crops(
+def _get_processed_max_dynamic_patch(
     processor: Callable[[ProcessorInputs], ProcessorInputs],
     inference_kwargs: Optional[Dict[str, int]],
 ) -> int:
@@ -120,27 +122,30 @@ def _get_processed_num_crops(
     assert "type" in processed_inputs
     assert processed_inputs["type"] == "token"
     assert "mm_processor_kwargs" in processed_inputs
-    return processed_inputs["mm_processor_kwargs"]["num_crops"]
+    return processed_inputs["mm_processor_kwargs"]["max_dynamic_patch"]
 
 
-@pytest.mark.parametrize("init_num_crops,inference_num_crops", [
-    (None, None),
-    (NUM_CROPS_OVERRIDE, None),
-    (DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE),
-])
-def test_input_processor_kwargs(use_processor_mock, init_num_crops,
-                                inference_num_crops):
+@pytest.mark.parametrize(
+    "init_max_dynamic_patch,inference_max_dynamic_patch", [
+        (None, None),
+        (MAX_DYNAMIC_PATCH_OVERRIDE, None),
+        (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE),
+    ])
+def test_input_processor_kwargs(use_processor_mock, init_max_dynamic_patch,
+                                inference_max_dynamic_patch):
     """Ensure input processors can use processor kwargs."""
     dummy_registry = InputRegistry()
 
-    init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info(
-        init_num_crops, inference_num_crops)
+    (init_kwargs, inference_kwargs,
+     expected_seq_count) = _get_max_dynamic_patch_info(
+         init_max_dynamic_patch, inference_max_dynamic_patch)
 
     ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs)
     processor = dummy_registry.create_input_processor(ctx.model_config)
-    num_crops_val = _get_processed_num_crops(processor, inference_kwargs)
+    max_dynamic_patch_val = _get_processed_max_dynamic_patch(
+        processor, inference_kwargs)
 
-    assert num_crops_val == expected_seq_count
+    assert max_dynamic_patch_val == expected_seq_count
 
 
 @pytest.mark.parametrize(
@@ -165,18 +170,21 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock,
 
     processor = dummy_registry.create_input_processor(ctx.model_config)
     # Should filter out the inference time kwargs
-    num_crops_val = _get_processed_num_crops(processor, mm_processor_kwargs)
-    assert num_crops_val == DEFAULT_NUM_CROPS
+    max_dynamic_patch_val = _get_processed_max_dynamic_patch(
+        processor, mm_processor_kwargs)
+    assert max_dynamic_patch_val == DEFAULT_MAX_DYNAMIC_PATCH
 
 
 ### Test overrides for the dummy data
-@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE])
-def test_dummy_data_kwarg_overrides(use_dummy_data_mock, num_crops):
+@pytest.mark.parametrize("max_dynamic_patch",
+                         [None, MAX_DYNAMIC_PATCH_OVERRIDE])
+def test_dummy_data_kwarg_overrides(use_dummy_data_mock, max_dynamic_patch):
     """Ensure dummy data factories can use processor kwargs."""
-    mm_processor_kwargs = None if num_crops is None else {
-        "num_crops": num_crops
+    mm_processor_kwargs = None if max_dynamic_patch is None else {
+        "max_dynamic_patch": max_dynamic_patch
     }
-    expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops
+    expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH
+                          if max_dynamic_patch is None else max_dynamic_patch)
     dummy_registry = InputRegistry()
     ctx = build_model_context(DUMMY_MODEL_ID,
                               mm_processor_kwargs=mm_processor_kwargs)
@@ -217,17 +225,20 @@ def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock,
     # len is solely dependent on the value of the mm_processor_kwargs.
     dummy_data = dummy_registry.dummy_data_for_profiling(
         ctx.model_config, seq_len=-1, mm_registry=mm_registry)
-    assert len(dummy_data.seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS
+    assert len(
+        dummy_data.seq_data.prompt_token_ids) == DEFAULT_MAX_DYNAMIC_PATCH
 
 
 ### Test overrides for the max token count per multimodal instance
-@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE])
-def test_max_tokens_kwarg_overrides(num_crops):
+@pytest.mark.parametrize("max_dynamic_patch",
+                         [None, MAX_DYNAMIC_PATCH_OVERRIDE])
+def test_max_tokens_kwarg_overrides(max_dynamic_patch):
     """Ensure max token calcs can use processor kwargs."""
-    mm_processor_kwargs = None if num_crops is None else {
-        "num_crops": num_crops
+    mm_processor_kwargs = None if max_dynamic_patch is None else {
+        "max_dynamic_patch": max_dynamic_patch
     }
-    expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops
+    expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH
+                          if max_dynamic_patch is None else max_dynamic_patch)
 
     ctx = build_model_context(MULTIMODAL_MODEL_ID,
                               task="generate",
@@ -239,11 +250,11 @@ def test_max_tokens_kwarg_overrides(num_crops):
     mm_registry.init_mm_limits_per_prompt(ctx.model_config)
     # Patch the image registry for phi3v with our lambda that is compatible
     # with overrides, then ensure that calling the method correctly echos
-    # our num_crops value back from the mm_processor_kwargs.
+    # our max_dynamic_patch value back from the mm_processor_kwargs.
     with patch.object(
             mm_registry._get_plugin("image"),
             "_max_mm_tokens",
-        {mm_model_cls(): get_num_crops},
+        {mm_model_cls(): get_max_dynamic_patch},
     ):
         max_multimodal_tokens = mm_registry.get_max_multimodal_tokens(
             ctx.model_config)
@@ -279,26 +290,29 @@ def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):
     with patch.object(
             mm_registry._get_plugin("image"),
             "_max_mm_tokens",
-        {mm_model_cls(): get_num_crops},
+        {mm_model_cls(): get_max_dynamic_patch},
     ):
         max_multimodal_tokens = mm_registry.get_max_multimodal_tokens(
             ctx.model_config)
 
-    assert max_multimodal_tokens == DEFAULT_NUM_CROPS
+    assert max_multimodal_tokens == DEFAULT_MAX_DYNAMIC_PATCH
 
 
 ### Test overrides for the mapper
-@pytest.mark.parametrize("num_crops", [DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE])
-def test_default_mapper_with_processor_kwargs(image_assets, num_crops):
+@pytest.mark.parametrize(
+    "max_dynamic_patch",
+    [DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE])
+def test_default_mapper_with_processor_kwargs(image_assets, max_dynamic_patch):
     """Ensure that the mapper processor kwargs can fall back to HF models."""
     # NOTE - we don't validate bad inputs for the default mapper, because it's
     # through the automodel interface in transformers, so we can't easily
     # inspect what kwargs are or are not allowed.
-    ctx = build_model_context(MULTIMODAL_MODEL_ID,
-                              task="generate",
-                              trust_remote_code=True,
-                              mm_processor_kwargs={"num_crops": num_crops},
-                              limit_mm_per_prompt={"image": 1})
+    ctx = build_model_context(
+        MULTIMODAL_MODEL_ID,
+        task="generate",
+        trust_remote_code=True,
+        mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch},
+        limit_mm_per_prompt={"image": 1})
 
     mm_registry = MultiModalRegistry()
     mm_registry.init_mm_limits_per_prompt(ctx.model_config)
@@ -307,20 +321,22 @@ def test_default_mapper_with_processor_kwargs(image_assets, num_crops):
     mm_inputs = {"image": image}
 
     mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs)
-    # Phi3v pixel vals should have shape: [batch, num_crops+1, 3, 336, 336]
-    assert mapped_inputs["pixel_values"].shape[1] == num_crops + 1
+    # pixel vals should have shape: [batch, max_dynamic_patch+1, ...]
+    assert mapped_inputs["pixel_values"].shape[1] == max_dynamic_patch + 1
 
 
-@pytest.mark.parametrize("init_num_crops,inference_num_crops", [
-    (None, None),
-    (NUM_CROPS_OVERRIDE, None),
-    (DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE),
-])
-def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops,
-                                       inference_num_crops):
+@pytest.mark.parametrize(
+    "init_max_dynamic_patch,inference_max_dynamic_patch", [
+        (None, None),
+        (MAX_DYNAMIC_PATCH_OVERRIDE, None),
+        (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE),
+    ])
+def test_custom_mapper_kwarg_overrides(image_assets, init_max_dynamic_patch,
+                                       inference_max_dynamic_patch):
     """Ensure custom mappers can use processor kwargs."""
-    init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info(
-        init_num_crops, inference_num_crops)
+    (init_kwargs, inference_kwargs,
+     expected_seq_count) = _get_max_dynamic_patch_info(
+         init_max_dynamic_patch, inference_max_dynamic_patch)
 
     ctx = build_model_context(MULTIMODAL_MODEL_ID,
                               task="generate",
@@ -335,7 +351,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops,
 
     # Patch the image registry for phi3v with our lambda that is compatible
     # with overrides, then ensure that calling the method correctly echos
-    # our num_crops value back from the mm_processor_kwargs.
+    # our max_dynamic_patch value back from the mm_processor_kwargs.
     mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
         mm_model_cls())
     mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs,
@@ -373,11 +389,12 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
 
     # Patch the image registry for phi3v with our lambda that is compatible
     # with overrides, then ensure that calling the method correctly echos
-    # our num_crops value back from the mm_processor_kwargs.
+    # our max_dynamic_patch value back from the mm_processor_kwargs.
     mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
         mm_model_cls())
     # Should filter out the inference time kwargs
     mapped_inputs = mm_registry.map_input(
         ctx.model_config, mm_inputs, mm_processor_kwargs=mm_processor_kwargs)
 
-    assert mapped_inputs["pixel_values"].shape[1] == DEFAULT_NUM_CROPS + 1
+    assert mapped_inputs["pixel_values"].shape[1] == (
+        DEFAULT_MAX_DYNAMIC_PATCH + 1)
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
index 3ebd7864b8fc8..2f4194a63fc25 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -2,19 +2,15 @@
 
 import torch
 
-from vllm.inputs import INPUT_REGISTRY
 from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
-                                              dummy_data_for_llava,
-                                              get_max_llava_image_tokens,
-                                              input_processor_for_llava)
+                                              LlavaProcessor,
+                                              get_max_llava_image_tokens)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
+@MULTIMODAL_REGISTRY.register_processor(LlavaProcessor)
 class MyLlava(LlavaForConditionalGeneration):
 
     def compute_logits(
diff --git a/tests/test_lazy_torch_compile.py b/tests/standalone_tests/lazy_torch_compile.py
similarity index 100%
rename from tests/test_lazy_torch_compile.py
rename to tests/standalone_tests/lazy_torch_compile.py
diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh
new file mode 100644
index 0000000000000..f00895c0997f1
--- /dev/null
+++ b/tests/standalone_tests/python_only_compile.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# This script tests if the python only compilation works correctly
+# for users who do not have any compilers installed on their system
+
+set -e
+set -x
+
+cd /vllm-workspace/
+
+# uninstall vllm
+pip3 uninstall -y vllm
+# restore the original files
+mv test_docs/vllm ./vllm
+
+# remove all compilers
+apt remove --purge build-essential -y
+apt autoremove -y
+
+echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py
+
+VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
+
+# Run the script
+python3 -c 'import vllm'
+
+# Check if the clangd log file was created
+if [ ! -f /tmp/changed.file ]; then
+    echo "changed.file was not created, python only compilation failed"
+    exit 1
+fi
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index bd11ff1877064..fef44ac29c41f 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -27,9 +27,8 @@ def make_request() -> EngineCoreRequest:
         request_id=uuid.uuid4(),
         prompt=PROMPT,
         prompt_token_ids=PROMPT_TOKENS,
-        mm_data=None,
+        mm_inputs=None,
         mm_placeholders=None,
-        mm_processor_kwargs=None,
         sampling_params=SamplingParams(),
         eos_token_id=None,
         arrival_time=time.time(),
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 582192196aaf9..4e003a25e91d2 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -29,9 +29,8 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
         request_id=str(uuid.uuid4()),
         prompt=PROMPT,
         prompt_token_ids=PROMPT_TOKENS,
-        mm_data=None,
+        mm_inputs=None,
         mm_placeholders=None,
-        mm_processor_kwargs=None,
         sampling_params=params,
         eos_token_id=None,
         arrival_time=time.time(),
diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py
index 9e166ae64dbfb..5289c91f201cd 100644
--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
@@ -4,12 +4,12 @@
 import pytest
 import torch
 
+from vllm.config import VllmConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
-from vllm.worker.model_runner import _get_graph_batch_size
 
 BATCH_SIZES = [1, 4, 16, 64, 256]
 
@@ -548,7 +548,7 @@ def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
     # With CUDA Graph capture and replay enabled, the decoder and encoder
     # input sequences will be padded. Create the expected padded tensors
     # accordingly.
-    graph_batch_size = _get_graph_batch_size(expanded_batch_size)
+    graph_batch_size = VllmConfig.get_graph_batch_size(expanded_batch_size)
     cuda_graph_pad_size = graph_batch_size - expanded_batch_size
     padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size))
     padded_encoder_seq_lens = encoder_seq_lens + list(
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index 433a9b30ba57a..4055524f3e0c7 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -3,13 +3,14 @@
 import pytest
 import torch
 
+from vllm.config import VllmConfig
 from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
                                              init_distributed_environment)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
 from vllm.utils import get_open_port
-from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size
+from vllm.worker.model_runner import ModelRunner
 
 
 def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
@@ -176,7 +177,7 @@ def test_prepare_decode_cuda_graph(batch_size):
         model_input.attn_metadata, model_input.attn_metadata.slot_mapping)
     assert len(slot_mapping) == len(input_tokens)
 
-    expected_bs = _get_graph_batch_size(len(seq_group_metadata_list))
+    expected_bs = VllmConfig.get_graph_batch_size(len(seq_group_metadata_list))
     # Verify input metadata is correct for prompts.
     device = model_runner.device
     assert attn_metadata.num_prefills == 0
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index 5471eec881d85..b3065495ab396 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -111,8 +111,16 @@ def __init__(
         self.matmul_qk = Matmul()
         self.softmax = Softmax()
         self.matmul_av = Matmul()
+        self.batch2block_matmul = Matmul()
+        self.block2batch_matmul = Matmul()
+        # NOTE(kzawora): Contiguous PA is off until model runner supports it
         self.k_cache = VLLMKVCache()
+        self.k_cache.use_contiguous_pa = False
         self.v_cache = VLLMKVCache()
+        self.v_cache.use_contiguous_pa = False
+        # NOTE(kzawora): Pipelined PA is off until model runner supports it
+        ops.pa_impl = ops.pa
+
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
         self.sliding_window = sliding_window
         self.alibi_slopes = alibi_slopes
@@ -229,9 +237,12 @@ def forward(
                 block_mapping=attn_metadata.block_mapping,
                 block_bias=attn_metadata.attn_bias,
                 block_scales=attn_metadata.block_scales,
+                block_groups=None,
                 scale=self.scale,
                 matmul_qk_op=self.matmul_qk,
                 matmul_av_op=self.matmul_av,
+                batch2block_matmul_op=self.batch2block_matmul,
+                block2batch_matmul_op=self.block2batch_matmul,
                 keys_fetch_func=self.k_cache.fetch_from_cache,
                 values_fetch_func=self.v_cache.fetch_from_cache)
         # Reshape the output tensor.
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 29a38285b6750..f191d44ff9bc5 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -3,6 +3,7 @@
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 from vllm.attention import AttentionMetadata, AttentionType
 from vllm.attention.selector import backend_name_to_enum, get_attn_backend
@@ -169,6 +170,68 @@ def extra_repr(self) -> str:
         return s
 
 
+class MultiHeadAttention(nn.Module):
+    """Multi-headed attention without any cache, used for ViT."""
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: Optional[int] = None,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = scale
+        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+
+        dtype = torch.get_default_dtype()
+        attn_backend = get_attn_backend(head_size,
+                                        dtype,
+                                        kv_cache_dtype=None,
+                                        block_size=16,
+                                        is_attention_free=False)
+        if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
+            attn_backend = _Backend.XFORMERS
+
+        self.attn_backend = attn_backend if attn_backend in {
+            _Backend.TORCH_SDPA, _Backend.XFORMERS
+        } else _Backend.TORCH_SDPA
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+    ) -> torch.Tensor:
+        """Input shape: batch_size x seq_len x hidden_size"""
+        # TODO(Isotr0py): Use existing backend implementations and support FA2
+        bsz, q_len, _ = query.size()
+        kv_len = key.size(1)
+
+        query = query.view(bsz, q_len, self.num_heads, self.head_size)
+        key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
+        value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)
+
+        if self.attn_backend == _Backend.XFORMERS:
+            from xformers import ops as xops
+
+            out = xops.memory_efficient_attention_forward(query,
+                                                          key,
+                                                          value,
+                                                          scale=self.scale)
+        elif self.attn_backend == _Backend.TORCH_SDPA:
+            query, key, value = (x.transpose(1, 2)
+                                 for x in (query, key, value))
+            out = F.scaled_dot_product_attention(query,
+                                                 key,
+                                                 value,
+                                                 scale=self.scale)
+            out = out.transpose(1, 2)
+        return out.view(bsz, q_len, -1)
+
+
 def unified_attention(
     query: torch.Tensor,
     key: torch.Tensor,
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 464bc2af8fd6d..f002a8ff905b1 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -1,5 +1,6 @@
 import copy
 import dataclasses
+import time
 from contextlib import ExitStack
 from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple
 from unittest.mock import patch
@@ -14,6 +15,7 @@
 
 from .counter import compilation_counter
 from .inductor_pass import InductorPass
+from .monitor import end_monitoring_torch_compile
 from .pass_manager import PostGradPassManager
 
 logger = init_logger(__name__)
@@ -22,31 +24,54 @@
 def wrap_inductor(graph,
                   example_inputs,
                   additional_inductor_config,
-                  do_logging=False,
+                  compilation_config: CompilationConfig,
+                  graph_index: int = 0,
+                  num_graphs: int = 1,
                   runtime_shape: Optional[int] = None,
                   use_inductor: bool = True):
+    if graph_index == 0:
+        # before compiling the first graph, record the start time
+        global compilation_start_time
+        compilation_start_time = time.time()
+
     if not use_inductor:
         return graph
 
     compilation_counter.num_inductor_compilations += 1
 
-    if do_logging:
-        if runtime_shape is None:
-            logger.info("Compiling a graph for general shape")
-        else:
-            logger.info("Compiling a graph for shape %s", runtime_shape)
-
     from torch._inductor import config
-    current_config = config.shallow_copy_dict()
+    current_config = config.get_config_copy()
     from torch._inductor.compile_fx import compile_fx
 
     if additional_inductor_config is not None:
         current_config.update(additional_inductor_config)
 
+    if isinstance(runtime_shape, int):
+        # for a specific batchsize, tuning triton kernel parameters
+        # can be beneficial
+        current_config["max_autotune"] = True
+        current_config["coordinate_descent_tuning"] = True
+
     # inductor can inplace modify the graph, so we need to copy it
     # see https://github.com/pytorch/pytorch/issues/138980
     graph = copy.deepcopy(graph)
-    return compile_fx(graph, example_inputs, config_patches=current_config)
+    compiled_graph = compile_fx(graph,
+                                example_inputs,
+                                config_patches=current_config)
+
+    # after compiling the last graph, record the end time
+    if graph_index == num_graphs - 1:
+        now = time.time()
+        elapsed = now - compilation_start_time
+        compilation_config.compilation_time += elapsed
+        if runtime_shape is None:
+            logger.info("Compiling a graph for general shape takes %.2f s",
+                        elapsed)
+        else:
+            logger.info("Compiling a graph for shape %s takes %.2f s",
+                        runtime_shape, elapsed)
+
+    return compiled_graph
 
 
 @dataclasses.dataclass
@@ -108,6 +133,8 @@ def split_graph(graph: fx.GraphModule,
 # we share the global graph pool among all the backends
 global_graph_pool = None
 
+compilation_start_time = 0.0
+
 
 class PiecewiseCompileInterpreter(torch.fx.Interpreter):
     """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`.
@@ -151,12 +178,15 @@ def call_module(self, target: torch.fx.node.Target,
             sym_shape_indices = [
                 i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
             ]
+            global compilation_start_time
             compiled_graph_for_general_shape = wrap_inductor(
                 submod,
                 args,
                 self.compilation_configs.inductor_compile_config,
+                self.compilation_configs,
+                graph_index=index,
+                num_graphs=len(self.compile_submod_names),
                 runtime_shape=None,
-                do_logging=index == 0,
                 use_inductor=self.compilation_configs.use_inductor)
 
             self.module.__dict__[target] = PiecewiseBackend(
@@ -235,17 +265,19 @@ def configure_post_pass(self):
 
     def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
+        # when dynamo calls the backend, it means the bytecode
+        # transform and analysis are done
         compilation_counter.num_graphs_seen += 1
+        from .monitor import torch_compile_start_time
+        dynamo_time = time.time() - torch_compile_start_time
+        logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time)
+        self.compilation_configs.compilation_time += dynamo_time
 
         # we control the compilation process, each instance can only be
         # called once
         assert not self._called, "VllmBackend can only be called once"
 
         self.graph = graph
-        # config is updated now, because only here can
-        # we get the sizes to capture for cudagraph
-        # from compilation context
-        self.compilation_configs.init_during_runtime()
         self.configure_post_pass()
 
         self.split_gm, self.piecewise_graphs = split_graph(
@@ -377,6 +409,8 @@ def __init__(self, graph: fx.GraphModule,
         # the entries for different shapes that we need to either
         # compile or capture cudagraph
         self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
+        self.to_be_compiled_sizes: Set[int] = self.compile_sizes.union(
+            self.capture_sizes)
         for shape in self.compile_sizes.union(self.capture_sizes):
             self.concrete_size_entries[shape] = ConcreteSizeEntry(
                 runtime_shape=shape,
@@ -387,6 +421,9 @@ def __init__(self, graph: fx.GraphModule,
     def __call__(self, *args) -> Any:
         if not self.first_run_finished:
             self.first_run_finished = True
+            # no specific sizes to compile
+            if self.is_last_graph and not self.to_be_compiled_sizes:
+                end_monitoring_torch_compile(self.compilation_configs)
             return self.compiled_graph_for_general_shape(*args)
 
         runtime_shape = args[self.sym_shape_indices[0]]
@@ -401,15 +438,22 @@ def __call__(self, *args) -> Any:
 
         if entry.need_to_compile and not entry.compiled:
             entry.compiled = True
+            self.to_be_compiled_sizes.remove(runtime_shape)
             # args are real arguments
             entry.runnable = wrap_inductor(
                 self.graph,
                 args,
                 self.compilation_configs.inductor_compile_config,
+                self.compilation_configs,
+                graph_index=self.piecewise_compile_index,
+                num_graphs=self.total_piecewise_compiles,
                 runtime_shape=runtime_shape,
-                do_logging=self.is_first_graph,
                 use_inductor=self.compilation_configs.use_inductor)
 
+            # finished compilations for all required shapes
+            if self.is_last_graph and not self.to_be_compiled_sizes:
+                end_monitoring_torch_compile(self.compilation_configs)
+
         if not entry.use_cudagraph:
             return entry.runnable(*args)
 
diff --git a/vllm/compilation/compile_context.py b/vllm/compilation/compile_context.py
deleted file mode 100644
index 29db3d4c637b9..0000000000000
--- a/vllm/compilation/compile_context.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from contextlib import contextmanager
-from typing import Any
-
-_compile_context: Any = None
-
-
-def get_compile_context() -> Any:
-    """Get the current compile context."""
-    return _compile_context
-
-
-@contextmanager
-def set_compile_context(context: Any):
-    """A context manager that stores the current compile context,
-    usually it is a list of sizes to specialize.
-    """
-    global _compile_context
-    prev_context = _compile_context
-    _compile_context = context
-    try:
-        yield
-    finally:
-        _compile_context = prev_context
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 8700243c9d904..938430fe2a501 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -11,6 +11,8 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils import supports_dynamo
 
+from .monitor import start_monitoring_torch_compile
+
 logger = init_logger(__name__)
 
 _T = TypeVar("_T", bound=type[nn.Module])
@@ -143,6 +145,7 @@ def _support_torch_compile(
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
         old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
+        self.vllm_config = vllm_config
         # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner
         # will handle the compilation, so we don't need to do anything here.
         self.do_not_compile = \
@@ -181,6 +184,8 @@ def __call__(self, *args, **kwargs):
                         raise ValueError(
                             "Unsupported dynamic dimensions"
                             f" {dims} for argument {k} with type {type(arg)}.")
+            # here, it is the starting point of the `torch.compile` process
+            start_monitoring_torch_compile(self.vllm_config.compilation_config)
 
         # if we don't use custom dispatcher, we can directly call the
         # compiled function and let torch.compile handle the dispatching,
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
new file mode 100644
index 0000000000000..3348674b09af2
--- /dev/null
+++ b/vllm/compilation/monitor.py
@@ -0,0 +1,19 @@
+import time
+
+from vllm.config import CompilationConfig, CompilationLevel
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+torch_compile_start_time: float = 0.0
+
+
+def start_monitoring_torch_compile(compilation_config: CompilationConfig):
+    global torch_compile_start_time
+    torch_compile_start_time = time.time()
+
+
+def end_monitoring_torch_compile(compilation_config: CompilationConfig):
+    if compilation_config.level == CompilationLevel.PIECEWISE:
+        logger.info("torch.compile takes %.2f s in total",
+                    compilation_config.compilation_time)
diff --git a/vllm/config.py b/vllm/config.py
index fe25ea9f76038..7bbb21333beaf 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,3 +1,4 @@
+import ast
 import copy
 import enum
 import hashlib
@@ -27,7 +28,7 @@
     get_hf_text_config, get_pooling_config,
     get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope)
 from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory,
-                        is_mi250, is_navi, print_warning_once,
+                        is_mi250, is_navi, print_warning_once, random_uuid,
                         resolve_obj_by_qualname)
 
 if TYPE_CHECKING:
@@ -511,13 +512,12 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
-        if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"):
+        if not current_platform.is_async_output_supported(self.enforce_eager):
             logger.warning(
-                "Async output processing is only supported for CUDA, TPU, XPU "
-                "and HPU."
-                "Disabling it for other platforms.")
+                "Async output processing is not supported on the "
+                "current platform type %s.", current_platform.device_type)
             self.use_async_output_proc = False
             return
 
@@ -527,22 +527,12 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
-        # If the feature combo become valid
-        if device_config.device_type == "cuda" and self.enforce_eager:
-            logger.warning(
-                "To see benefits of async output processing, enable CUDA "
-                "graph. Since, enforce-eager is enabled, async output "
-                "processor cannot be used")
-            self.use_async_output_proc = not self.enforce_eager
-            return
-
         # Async postprocessor is not necessary with embedding mode
         # since there is no token generation
         if self.task == "embedding":
             self.use_async_output_proc = False
 
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if speculative_config:
             logger.warning("Async output processing is not supported with"
@@ -933,7 +923,9 @@ def __post_init__(self):
         if isinstance(model_loader_extra_config, str):
             self.model_loader_extra_config = json.loads(
                 model_loader_extra_config)
-        self._verify_load_format()
+        if isinstance(self.load_format, str):
+            load_format = self.load_format.lower()
+            self.load_format = LoadFormat(load_format)
 
         if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
             logger.info(
@@ -942,25 +934,6 @@ def __post_init__(self):
         else:
             self.ignore_patterns = ["original/**/*"]
 
-    def _verify_load_format(self) -> None:
-        if not isinstance(self.load_format, str):
-            return
-
-        load_format = self.load_format.lower()
-        self.load_format = LoadFormat(load_format)
-
-        rocm_not_supported_load_format: List[str] = []
-        if current_platform.is_rocm(
-        ) and load_format in rocm_not_supported_load_format:
-            rocm_supported_load_format = [
-                f for f in LoadFormat.__members__
-                if (f not in rocm_not_supported_load_format)
-            ]
-            raise ValueError(
-                f"load format '{load_format}' is not supported in ROCm. "
-                f"Supported load formats are "
-                f"{rocm_supported_load_format}")
-
 
 @dataclass
 class ParallelConfig:
@@ -1748,7 +1721,7 @@ def verify_with_model_config(self, model_config: ModelConfig):
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if scheduler_config.chunked_prefill_enabled:
             raise ValueError("LoRA is not supported with chunked prefill yet.")
@@ -1816,15 +1789,15 @@ class PoolerConfig:
 
     step_tag_id: Optional[int] = None
     """
-    If set, only the score corresponding to the ``step_tag_id`` in the 
+    If set, only the score corresponding to the ``step_tag_id`` in the
     generated sentence should be returned. Otherwise, the scores for all tokens
     are returned.
     """
 
     returned_token_ids: Optional[List[int]] = None
     """
-    A list of indices for the vocabulary dimensions to be extracted, 
-    such as the token IDs of ``good_token`` and ``bad_token`` in the 
+    A list of indices for the vocabulary dimensions to be extracted,
+    such as the token IDs of ``good_token`` and ``bad_token`` in the
     ``math-shepherd-mistral-7b-prm`` model.
     """
 
@@ -2058,11 +2031,12 @@ def get_served_model_name(model: str,
 class DecodingConfig:
     """Dataclass which contains the decoding strategy of the engine"""
 
-    # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
-    guided_decoding_backend: str = 'outlines'
+    # Which guided decoding algo to use.
+    # 'outlines' / 'lm-format-enforcer' / 'xgrammar'
+    guided_decoding_backend: str = 'xgrammar'
 
     def __post_init__(self):
-        valid_guided_backends = ['outlines', 'lm-format-enforcer']
+        valid_guided_backends = ['outlines', 'lm-format-enforcer', 'xgrammar']
         backend = self.guided_decoding_backend
         if backend not in valid_guided_backends:
             raise ValueError(f"Invalid guided_decoding_backend '{backend},"
@@ -2125,7 +2099,7 @@ class KVTransferConfig(BaseModel):
 
     @classmethod
     def from_cli(cls, cli_value: str) -> "KVTransferConfig":
-        """Parse the CLI value for the compilation config."""
+        """Parse the CLI value for the kv cache transfer config."""
         return KVTransferConfig.model_validate_json(cli_value)
 
     def model_post_init(self, __context: Any) -> None:
@@ -2218,8 +2192,8 @@ class CompilationConfig(BaseModel):
             TODO: move outside cudagraph logic into compilation.
             torch.compile will handle cudagraph capture logic in the future.
         - cudagraph_capture_sizes: sizes to capture cudagraph.
-            - None: capture sizes are inferred from compilation context.
-            - List[int]: capture sizes are specified.
+            - None (default): capture sizes are inferred from vllm config.
+            - List[int]: capture sizes are specified as given.
         - cudagraph_num_of_warmups: number of warmup runs for cudagraph.
             It means the first several runs will be treated as warmup runs.
             Only after that, the execution will be recorded, and the recorded
@@ -2233,14 +2207,10 @@ class CompilationConfig(BaseModel):
         - use_inductor: whether to use inductor compilation.
             - False: inductor compilation is not used. graph runs in eager.
             - True: inductor compilation is used. one graph for symbolic shape
-                is compiled. In addition, compile for different sizes specified
-                in inductor_compile_sizes, using configurations
+                is compiled. In addition, compile for cudagraph sizes that are
+                in candidate_compile_sizes, using configurations
                 in inductor_compile_config.
-        - inductor_compile_sizes: sizes to compile for inductor.
-        - inductor_specialize_for_cudagraph_no_more_than: an optional integer
-            to specialize inductor for cudagraph sizes no more than the
-            specified size. It is useful when we want to specialize inductor
-            with a subset of cudagraph sizes.
+        - candidate_compile_sizes: sizes to compile for inductor.
         - inductor_compile_config: additional configurations for inductor.
             - None: use default configurations.
         - inductor_passes: additional passes for inductor. It is a dictionary
@@ -2249,7 +2219,7 @@ class CompilationConfig(BaseModel):
             from Python, functions can also be passed directly via Python object
             constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`
         - custom inductor passes: see PassConfig for more details
-    
+
     Why we have different sizes for cudagraph and inductor:
     - cudagraph: a cudagraph captured for a specific size can only be used
         for the same size. We need to capture all the sizes we want to use.
@@ -2269,8 +2239,7 @@ class CompilationConfig(BaseModel):
     ])
 
     use_inductor: bool = True
-    inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None
-    inductor_compile_sizes: Optional[List[int]] = Field(default=None)
+    candidate_compile_sizes: Optional[List[int]] = Field(default=None)
     inductor_compile_config: Dict = Field(default_factory=dict)
     inductor_passes: Dict[str, str] = Field(default_factory=dict)
 
@@ -2324,6 +2293,7 @@ def model_post_init(self, __context: Any) -> None:
     # keep track of enabled and disabled custom ops
     enabled_custom_ops: Counter[str] = PrivateAttr
     disabled_custom_ops: Counter[str] = PrivateAttr
+    compilation_time: float = PrivateAttr
 
     # Per-model forward context
     # Mainly used to store attention cls
@@ -2335,7 +2305,9 @@ def from_cli(cls, cli_value: str) -> "CompilationConfig":
         """Parse the CLI value for the compilation config."""
         if cli_value in ["0", "1", "2", "3"]:
             return cls(level=int(cli_value))
-        return CompilationConfig.model_validate_json(cli_value)
+        # do not use `eval`, it is dangerous and can execute arbitrary code
+        dict_value = ast.literal_eval(cli_value)
+        return CompilationConfig.model_validate(dict_value)
 
     def model_post_init(self, __context: Any) -> None:
 
@@ -2362,6 +2334,7 @@ def model_post_init(self, __context: Any) -> None:
         self.enabled_custom_ops = Counter()
         self.disabled_custom_ops = Counter()
         self.static_forward_context = {}
+        self.compilation_time = 0.0
 
     def init_backend(self) -> Union[str, Callable]:
         if self.level == CompilationLevel.NO_COMPILATION:
@@ -2384,15 +2357,10 @@ def init_backend(self) -> Union[str, Callable]:
         from vllm.compilation.backends import VllmBackend
         return VllmBackend(self)
 
-    def init_during_runtime(self):
+    def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]):
         """To complete the initialization of config,
-        we need to know the compile context, which is only available
-        during the first run of the model.
-        """
-        from vllm.compilation.compile_context import get_compile_context
-        context = get_compile_context()
-        context = copy.deepcopy(context) if context is not None else []
-        sizes_to_specialize: List[int] = context
+        we need to know the cudagraph sizes."""
+
         if self.cudagraph_capture_sizes is None:
             self.capture_sizes = sizes_to_specialize
         else:
@@ -2400,18 +2368,35 @@ def init_during_runtime(self):
             logger.info(("cudagraph sizes specified by model runner"
                          " %s is overridden by config %s"),
                         sizes_to_specialize, self.cudagraph_capture_sizes)
-        if self.inductor_specialize_for_cudagraph_no_more_than is not None:
-            assert self.inductor_compile_sizes is None, (
-                "inductor_compile_sizes should be None when "
-                "inductor_specialize_for_cudagraph_no_more_than is not None")
-            self.compile_sizes = [
-                x for x in self.capture_sizes
-                if x <= self.inductor_specialize_for_cudagraph_no_more_than
-            ]
-        else:
-            if self.inductor_compile_sizes is None:
-                self.inductor_compile_sizes = []
-            self.compile_sizes = self.inductor_compile_sizes
+
+        if self.candidate_compile_sizes is None:
+            self.candidate_compile_sizes = []
+        self.compile_sizes = [
+            x for x in self.candidate_compile_sizes if x in self.capture_sizes
+        ]
+        ignored_sizes = [
+            x for x in self.candidate_compile_sizes
+            if x not in self.capture_sizes
+        ]
+        if ignored_sizes:
+            logger.warning(("candidate_compile_sizes %s are ignored "
+                            "because they are not cudagraph capture sizes."),
+                           ignored_sizes)
+
+        # sort to make sure cudagraph capture sizes are in descending order
+        self.capture_sizes.sort(reverse=True)
+
+
+_BATCH_SIZE_ALIGNMENT = 8
+# all the token sizes that **can** be captured by cudagraph.
+# they can be arbitrarily large.
+# currently it includes: 1, 2, 4, 8, 16, 24, 32, 40, ..., 8192.
+# the actual sizes to capture will be determined by the model,
+# depending on the model's max_num_seqs.
+# NOTE: get_graph_batch_size needs to be updated if this list is changed.
+_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
+    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025)
+]
 
 
 @dataclass
@@ -2439,6 +2424,42 @@ class VllmConfig:
                                                   init=True)  # type: ignore
     kv_transfer_config: KVTransferConfig = field(default=None,
                                                  init=True)  # type: ignore
+    instance_id: str = ""
+
+    @staticmethod
+    def get_graph_batch_size(batch_size: int) -> int:
+        """Returns the padded batch size given actual batch size.
+
+        Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT,
+        2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT...
+        """
+        if batch_size <= 2:
+            return batch_size
+        elif batch_size <= 4:
+            return 4
+        else:
+            return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) //
+                    _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)
+
+    @staticmethod
+    def get_max_graph_batch_size(max_num_seqs: int) -> int:
+        """
+        max_num_seqs: Maximum number of sequences in a batch.
+        _BATCH_SIZES_TO_CAPTURE: all the sizes that we want to capture.
+
+        pad the max_num_seqs if necessary by calling get_graph_batch_size,
+        which will deal with some edge cases like 1, 2, 4.
+
+        if the padded size is in _BATCH_SIZES_TO_CAPTURE, return the padded
+        size. if not, it means the padded size is larger than the largest size
+        in _BATCH_SIZES_TO_CAPTURE, return the largest size in
+        _BATCH_SIZES_TO_CAPTURE.
+        """
+        padded_size = VllmConfig.get_graph_batch_size(max_num_seqs)
+        if padded_size in _BATCH_SIZES_TO_CAPTURE:
+            return padded_size
+        assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1]
+        return _BATCH_SIZES_TO_CAPTURE[-1]
 
     @staticmethod
     def _get_quantization_config(
@@ -2468,7 +2489,15 @@ def _get_quantization_config(
             return quant_config
         return None
 
-    def with_hf_config(self, hf_config: PretrainedConfig) -> "VllmConfig":
+    def with_hf_config(
+        self,
+        hf_config: PretrainedConfig,
+        architectures: Optional[list[str]] = None,
+    ) -> "VllmConfig":
+        if architectures is not None:
+            hf_config = copy.deepcopy(hf_config)
+            hf_config.architectures = architectures
+
         model_config = copy.deepcopy(self.model_config)
         model_config.hf_config = hf_config
 
@@ -2519,10 +2548,33 @@ def __post_init__(self):
             self.compilation_config.custom_ops = ["none"]
             self.compilation_config.use_cudagraph = True
             self.compilation_config.use_inductor = True
+            self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_reshape = False
             self.compilation_config.level = CompilationLevel.PIECEWISE
 
+        if not envs.VLLM_USE_V1:
+            max_batchsize_to_capture = 0
+            if self.scheduler_config is not None and \
+                self.model_config is not None and \
+                    not self.model_config.enforce_eager:
+                max_batchsize_to_capture = \
+                    self.get_max_graph_batch_size(
+                    self.scheduler_config.max_num_seqs)
+            batch_size_capture_list = [
+                size for size in _BATCH_SIZES_TO_CAPTURE
+                if size <= max_batchsize_to_capture
+            ]
+        else:
+            batch_size_capture_list = []
+            if self.model_config is not None and \
+                not self.model_config.enforce_eager:
+                batch_size_capture_list = [1, 2, 4
+                                           ] + [i for i in range(8, 513, 8)]
+
+        self.compilation_config.init_with_cudagraph_sizes(
+            batch_size_capture_list)
+
         if self.cache_config is not None and \
             self.cache_config.cpu_offload_gb > 0 and \
             self.compilation_config.level != CompilationLevel.NO_COMPILATION:
@@ -2539,46 +2591,44 @@ def __post_init__(self):
 
         current_platform.check_and_update_config(self)
 
+        if not self.instance_id:
+            self.instance_id = random_uuid()[:5]
+
     def __str__(self):
-        return ("model=%r, speculative_config=%r, tokenizer=%r, "
-        "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
-        "override_neuron_config=%s, tokenizer_revision=%s, "
-        "trust_remote_code=%s, dtype=%s, max_seq_len=%d, "
-        "download_dir=%r, load_format=%s, tensor_parallel_size=%d, "
-        "pipeline_parallel_size=%d, "
-        "disable_custom_all_reduce=%s, quantization=%s, "
-        "enforce_eager=%s, kv_cache_dtype=%s, "
-        "quantization_param_path=%s, device_config=%s, "
-        "decoding_config=%r, observability_config=%r, "
-        "seed=%d, served_model_name=%s, "
-        "num_scheduler_steps=%d, enable_prefix_caching=%s, "
-        "use_async_output_proc=%s, mm_processor_kwargs=%s") % \
-        (self.model_config.model, self.speculative_config,
-        self.model_config.tokenizer,
-        self.model_config.skip_tokenizer_init,
-        self.model_config.tokenizer_mode,
-        self.model_config.revision,
-        self.model_config.override_neuron_config,
-        self.model_config.tokenizer_revision,
-        self.model_config.trust_remote_code,
-        self.model_config.dtype,
-        self.model_config.max_model_len,
-        self.load_config.download_dir,
-        self.load_config.load_format,
-        self.parallel_config.tensor_parallel_size,
-        self.parallel_config.pipeline_parallel_size,
-        self.parallel_config.disable_custom_all_reduce,
-        self.model_config.quantization,
-        self.model_config.enforce_eager,
-        self.cache_config.cache_dtype,
-        self.model_config.quantization_param_path,
-        self.device_config.device, self.decoding_config,
-        self.observability_config, self.model_config.seed,
-        self.model_config.served_model_name,
-        self.scheduler_config.num_scheduler_steps,
-        self.cache_config.enable_prefix_caching,
-        self.model_config.use_async_output_proc,
-        self.model_config.mm_processor_kwargs)
+        return (
+            f"model={self.model_config.model!r},"
+            f" speculative_config={self.speculative_config!r},"
+            f" tokenizer={self.model_config.tokenizer!r}, "
+            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init},"
+            f" tokenizer_mode={self.model_config.tokenizer_mode}, "
+            f"revision={self.model_config.revision}, "
+            f"override_neuron_config={self.model_config.override_neuron_config},"
+            f" tokenizer_revision={self.model_config.tokenizer_revision}, "
+            f"trust_remote_code={self.model_config.trust_remote_code}, "
+            f"dtype={self.model_config.dtype}, "
+            f"max_seq_len={self.model_config.max_model_len},"
+            f" download_dir={self.load_config.download_dir!r}, "
+            f"load_format={self.load_config.load_format}, "
+            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size},"
+            f" pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
+            f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
+            f"quantization={self.model_config.quantization}, "
+            f"enforce_eager={self.model_config.enforce_eager}, "
+            f"kv_cache_dtype={self.cache_config.cache_dtype}, "
+            f"quantization_param_path={self.model_config.quantization_param_path},"
+            f" device_config={self.device_config.device}, "
+            f"decoding_config={self.decoding_config!r}, "
+            f"observability_config={self.observability_config!r}, "
+            f"seed={self.model_config.seed}, "
+            f"served_model_name={self.model_config.served_model_name}, "
+            f"num_scheduler_steps={self.scheduler_config.num_scheduler_steps}, "
+            f"multi_step_stream_outputs={self.scheduler_config.multi_step_stream_outputs}, "  # noqa
+            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
+            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
+            f"use_async_output_proc={self.model_config.use_async_output_proc}, "
+            f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, "
+            f"pooler_config={self.model_config.pooler_config!r},"
+            f" compilation_config={self.compilation_config!r}")
 
 
 _current_vllm_config: Optional[VllmConfig] = None
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index d4e3f81747038..a6800f93f167b 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -197,6 +197,25 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None):
                            ncclDataTypeEnum.from_torch(tensor.dtype), src,
                            self.comm, cudaStream_t(stream.cuda_stream))
 
+    def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}")
+        if stream is None:
+            stream = self.stream
+        if src == self.rank:
+            sendbuff = buffer_type(tensor.data_ptr())
+            # NCCL requires the sender also to have a receive buffer
+            recvbuff = buffer_type(tensor.data_ptr())
+        else:
+            sendbuff = buffer_type()
+            recvbuff = buffer_type(tensor.data_ptr())
+        self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(),
+                                ncclDataTypeEnum.from_torch(tensor.dtype), src,
+                                self.comm, cudaStream_t(stream.cuda_stream))
+
     @contextmanager
     def change_state(self,
                      enable: Optional[bool] = None,
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index ff88f72470b27..7dea61b6a09f1 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -189,6 +189,15 @@ class NCCLLibrary:
             ncclComm_t, cudaStream_t
         ]),
 
+        # ncclResult_t ncclBroadcast(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, int root, ncclComm_t comm,
+        #   cudaStream_t stream);
+        Function("ncclBroadcast", ncclResult_t, [
+            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
+            ctypes.c_int, ncclComm_t, cudaStream_t
+        ]),
+
         # be cautious! this is a collective call, it will block until all
         # processes in the communicator have called this function.
         # because Python object destruction can happen in random order,
@@ -312,6 +321,13 @@ def ncclRecv(self, recvbuff: buffer_type, count: int, datatype: int,
         self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src,
                                                 comm, stream))
 
+    def ncclBroadcast(self, sendbuff: buffer_type, recvbuff: buffer_type,
+                      count: int, datatype: int, root: int, comm: ncclComm_t,
+                      stream: cudaStream_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclBroadcast"](sendbuff, recvbuff, count,
+                                                     datatype, root, comm,
+                                                     stream))
+
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 4aa0eebd976c9..3db069ec64ee4 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -168,7 +168,7 @@ class EngineArgs:
     scheduler_delay_factor: float = 0.0
     enable_chunked_prefill: Optional[bool] = None
 
-    guided_decoding_backend: str = 'outlines'
+    guided_decoding_backend: str = 'xgrammar'
     # Speculative decoding configuration.
     speculative_model: Optional[str] = None
     speculative_model_quantization: Optional[str] = None
@@ -209,12 +209,9 @@ def __post_init__(self):
         # support `EngineArgs(compilation_config={...})`
         # without having to manually construct a
         # CompilationConfig object
-        if isinstance(self.compilation_config, (int)):
+        if isinstance(self.compilation_config, (int, dict)):
             self.compilation_config = CompilationConfig.from_cli(
                 str(self.compilation_config))
-        elif isinstance(self.compilation_config, (dict)):
-            self.compilation_config = CompilationConfig.from_cli(
-                json.dumps(self.compilation_config))
 
         # Setup plugins
         from vllm.plugins import load_general_plugins
@@ -364,11 +361,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument(
             '--guided-decoding-backend',
             type=str,
-            default='outlines',
-            choices=['outlines', 'lm-format-enforcer'],
+            default='xgrammar',
+            choices=['outlines', 'lm-format-enforcer', 'xgrammar'],
             help='Which engine will be used for guided decoding'
             ' (JSON schema / regex etc) by default. Currently support '
-            'https://github.com/outlines-dev/outlines and '
+            'https://github.com/outlines-dev/outlines,'
+            'https://github.com/mlc-ai/xgrammar, and '
             'https://github.com/noamgat/lm-format-enforcer.'
             ' Can be overridden per request via guided_decoding_backend'
             ' parameter.')
@@ -432,6 +430,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                             'capping to sliding window size')
         parser.add_argument('--use-v2-block-manager',
                             action='store_true',
+                            default=True,
                             help='[DEPRECATED] block manager v1 has been '
                             'removed and SelfAttnBlockSpaceManager (i.e. '
                             'block manager v2) is now the default. '
@@ -1051,9 +1050,12 @@ def create_engine_config(self,
             # long context (> 32K) models. This is to avoid OOM errors in the
             # initial memory profiling phase.
 
-            # Chunked prefill is currently disabled for multimodal models by
-            # default.
-            if use_long_context and not model_config.is_multimodal_model:
+            # For multimodal models, chunked prefill is disabled by default in
+            # V0, but enabled by design in V1
+            if model_config.is_multimodal_model:
+                self.enable_chunked_prefill = bool(envs.VLLM_USE_V1)
+
+            elif use_long_context:
                 is_gpu = device_config.device_type == "cuda"
                 use_sliding_window = (model_config.get_sliding_window()
                                       is not None)
@@ -1110,7 +1112,7 @@ def create_engine_config(self,
             disable_logprobs=self.disable_logprobs_during_spec_decoding,
         )
 
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if self.num_scheduler_steps > 1:
             if speculative_config is not None:
@@ -1242,12 +1244,9 @@ def _override_v1_engine_config(self, engine_config: VllmConfig) -> None:
         Override the EngineConfig's configs based on the usage context for V1.
         """
         assert envs.VLLM_USE_V1, "V1 is not enabled"
-        # TODO (ywang96): Enable APC by default when VLM supports it.
         if engine_config.model_config.is_multimodal_model:
-            logger.warning(
-                "Prefix caching is currently not supported for multimodal "
-                "models and has been disabled.")
-            engine_config.cache_config.enable_prefix_caching = False
+            # TODO (ywang96): Enable APC by default when VLM supports it.
+            assert not engine_config.cache_config.enable_prefix_caching
 
 
 @dataclass
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 7b1bb7b05708d..60dccd7a0812c 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import time
 import weakref
 from functools import partial
@@ -6,6 +7,8 @@
                     List, Mapping, Optional, Set, Tuple, Type, Union, overload)
 from weakref import ReferenceType
 
+from typing_extensions import deprecated
+
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
@@ -422,7 +425,8 @@ async def get_tokenizer_async(self,
         return await (
             self.get_tokenizer_group().get_lora_tokenizer_async(lora_request))
 
-    @overload  # DEPRECATED
+    @overload
+    @deprecated("'inputs' will be renamed to 'prompt")
     async def add_request_async(
         self,
         request_id: str,
@@ -504,7 +508,8 @@ async def add_request_async(
                 sampling_params=params,
                 tokenizer=await self.get_tokenizer_async(lora_request),
                 default_guided_backend=self.decoding_config.
-                guided_decoding_backend)
+                guided_decoding_backend,
+                model_config=self.model_config)
 
         self._add_processed_request(
             request_id=request_id,
@@ -525,22 +530,30 @@ async def check_health_async(self) -> None:
 
 async def build_guided_decoding_logits_processor_async(
         sampling_params: SamplingParams, tokenizer: AnyTokenizer,
-        default_guided_backend: str) -> SamplingParams:
+        default_guided_backend: str,
+        model_config: ModelConfig) -> SamplingParams:
     """Constructs logits processors based on the guided_decoding,
     logits_bias, and allowed_token_ids fields in sampling_params. Deletes
     those fields and adds the constructed logits processors to the
     logits_processors field. Modifies sampling params in-place and returns
     the modified sampling params."""
-    if (guided_decoding := sampling_params.guided_decoding) is None:
+    if sampling_params.guided_decoding is None:
         return sampling_params
 
+    # Defensively copy sampling params since guided decoding logits
+    # processors can have different state for each request
+    sampling_params = copy.copy(sampling_params)
+    guided_decoding = sampling_params.guided_decoding
+
     logger.debug("Building guided decoding logits processor. "
                  "Params: %s", guided_decoding)
 
     guided_decoding.backend = guided_decoding.backend or default_guided_backend
 
     processor = await get_guided_decoding_logits_processor(
-        guided_params=guided_decoding, tokenizer=tokenizer)
+        guided_params=guided_decoding,
+        tokenizer=tokenizer,
+        model_config=model_config)
 
     if processor:
         if sampling_params.logits_processors is None:
@@ -894,7 +907,8 @@ async def run_engine_loop(engine_ref: ReferenceType):
 
     # This method does not need to be async, but kept that way
     # for backwards compatibility.
-    @overload  # DEPRECATED
+    @overload
+    @deprecated("'inputs' will be renamed to 'prompt")
     def add_request(
         self,
         request_id: str,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index d2d50b3238daa..336b61f4daec4 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,3 +1,4 @@
+import copy
 import time
 from collections import Counter as collectionsCounter
 from collections import deque
@@ -10,7 +11,7 @@
 from typing import Set, Type, Union, cast, overload
 
 import torch
-from typing_extensions import TypeVar
+from typing_extensions import TypeVar, deprecated
 
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
@@ -246,60 +247,12 @@ def __init__(
         )
 
         logger.info(
-            "Initializing an LLM engine (v%s) with config: "
-            "model=%r, speculative_config=%r, tokenizer=%r, "
-            "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
-            "override_neuron_config=%s, tokenizer_revision=%s, "
-            "trust_remote_code=%s, dtype=%s, max_seq_len=%d, "
-            "download_dir=%r, load_format=%s, tensor_parallel_size=%d, "
-            "pipeline_parallel_size=%d, "
-            "disable_custom_all_reduce=%s, quantization=%s, "
-            "enforce_eager=%s, kv_cache_dtype=%s, "
-            "quantization_param_path=%s, device_config=%s, "
-            "decoding_config=%r, observability_config=%r, "
-            "seed=%d, served_model_name=%s, "
-            "num_scheduler_steps=%d, chunked_prefill_enabled=%s "
-            "multi_step_stream_outputs=%s, enable_prefix_caching=%s, "
-            "use_async_output_proc=%s, use_cached_outputs=%s, "
-            "mm_processor_kwargs=%s, pooler_config=%r,"
-            "compilation_config=%r",
+            "Initializing an LLM engine (v%s) with config: %r,"
+            "use_cached_outputs=%s, ",
             VLLM_VERSION,
-            self.model_config.model,
-            self.speculative_config,
-            self.model_config.tokenizer,
-            self.model_config.skip_tokenizer_init,
-            self.model_config.tokenizer_mode,
-            self.model_config.revision,
-            self.model_config.override_neuron_config,
-            self.model_config.tokenizer_revision,
-            self.model_config.trust_remote_code,
-            self.model_config.dtype,
-            self.model_config.max_model_len,
-            self.load_config.download_dir,
-            self.load_config.load_format,
-            self.parallel_config.tensor_parallel_size,
-            self.parallel_config.pipeline_parallel_size,
-            self.parallel_config.disable_custom_all_reduce,
-            self.model_config.quantization,
-            self.model_config.enforce_eager,
-            self.cache_config.cache_dtype,
-            self.model_config.quantization_param_path,
-            self.device_config.device,
-            self.decoding_config,
-            self.observability_config,
-            self.model_config.seed,
-            self.model_config.served_model_name,
-            self.scheduler_config.num_scheduler_steps,
-            self.scheduler_config.chunked_prefill_enabled,
-            self.scheduler_config.multi_step_stream_outputs,
-            self.cache_config.enable_prefix_caching,
-            self.model_config.use_async_output_proc,
+            vllm_config,
             use_cached_outputs,
-            self.model_config.mm_processor_kwargs,
-            self.model_config.pooler_config,
-            vllm_config.compilation_config,
         )
-        # TODO(woosuk): Print more configs in debug mode.
 
         self.log_stats = log_stats
         self.use_cached_outputs = use_cached_outputs
@@ -472,6 +425,7 @@ def _initialize_kv_caches(self) -> None:
         The workers will determine the number of blocks in both the GPU cache
         and the swap CPU cache.
         """
+        start = time.time()
         num_gpu_blocks, num_cpu_blocks = (
             self.model_executor.determine_num_available_blocks())
 
@@ -487,6 +441,9 @@ def _initialize_kv_caches(self) -> None:
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
         self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+        elapsed = time.time() - start
+        logger.info(("init engine (profile, create kv cache, "
+                     "warmup model) took %.2f seconds"), elapsed)
 
     @classmethod
     def _get_executor_cls(cls,
@@ -619,7 +576,7 @@ def _init_tokenizer(self) -> BaseTokenizerGroup:
             model_config=self.model_config,
             scheduler_config=self.scheduler_config,
             parallel_config=self.parallel_config,
-            enable_lora=bool(self.lora_config))
+            lora_config=self.lora_config)
 
     def _verify_args(self) -> None:
         self.model_config.verify_with_parallel_config(self.parallel_config)
@@ -719,7 +676,8 @@ def _add_processed_request(
     def stop_remote_worker_execution_loop(self) -> None:
         self.model_executor.stop_remote_worker_execution_loop()
 
-    @overload  # DEPRECATED
+    @overload
+    @deprecated("'inputs' will be renamed to 'prompt")
     def add_request(
         self,
         request_id: str,
@@ -1023,9 +981,9 @@ def _update_num_computed_tokens_for_multi_step_prefill(
         This function updates num_computed_tokens for prompt sequences
         when Multi-Step is enabled.
 
-        seq_group: SequenceGroup to update the num_computed_tokens for. 
+        seq_group: SequenceGroup to update the num_computed_tokens for.
         seq_group_meta: Metadata of the given SequenceGroup.
-        is_first_step_output: Optional[bool] - 
+        is_first_step_output: Optional[bool] -
             When available, is_first_step_output indicates if the appended
             output token is the output of the first-step in multi-step.
             A value of None indicates that outputs from all steps in
@@ -2035,7 +1993,11 @@ def _build_logits_processors(
 
         logits_processors = []
 
-        if (guided_decoding := sampling_params.guided_decoding) is not None:
+        if sampling_params.guided_decoding is not None:
+            # Defensively copy sampling params since guided decoding logits
+            # processors can have different state for each request
+            sampling_params = copy.copy(sampling_params)
+            guided_decoding = sampling_params.guided_decoding
 
             logger.debug(
                 "Building guided decoding logits processor in "
@@ -2046,7 +2008,9 @@ def _build_logits_processors(
                 self.decoding_config.guided_decoding_backend
 
             processor = get_local_guided_decoding_logits_processor(
-                guided_params=guided_decoding, tokenizer=tokenizer)
+                guided_params=guided_decoding,
+                tokenizer=tokenizer,
+                model_config=self.model_config)
             if processor:
                 logits_processors.append(processor)
 
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 4869557ba9b44..a5ae21c3966a7 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -599,9 +599,9 @@ def _log_prometheus(self, stats: Stats) -> None:
                             stats.time_queue_requests)
         self._log_histogram(self.metrics.histogram_inference_time_request,
                             stats.time_inference_requests)
-        self._log_histogram(self.metrics.histogram_decode_time_request,
-                            stats.time_prefill_requests)
         self._log_histogram(self.metrics.histogram_prefill_time_request,
+                            stats.time_prefill_requests)
+        self._log_histogram(self.metrics.histogram_decode_time_request,
                             stats.time_decode_requests)
         self._log_histogram(self.metrics.histogram_time_in_queue_request,
                             stats.time_in_queue_requests)
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index 34c161e9395ae..7020012e8bb86 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -2,6 +2,8 @@
 from enum import Enum
 from typing import List, Mapping, Optional, Union, overload
 
+from typing_extensions import deprecated
+
 from vllm import PoolingParams
 from vllm.inputs import PromptType
 from vllm.lora.request import LoRARequest
@@ -32,7 +34,8 @@ class RPCProcessRequest:
     prompt_adapter_request: Optional[PromptAdapterRequest] = None
     priority: int = 0
 
-    @overload  # DEPRECATED
+    @overload
+    @deprecated("'inputs' will be renamed to 'prompt")
     def __init__(
         self,
         *,
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index d26728e8c6e67..7e4f81b2cf8e2 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -9,6 +9,7 @@
 import psutil
 import zmq
 import zmq.asyncio
+from typing_extensions import deprecated
 from zmq import Frame  # type: ignore[attr-defined]
 from zmq.asyncio import Socket
 
@@ -93,8 +94,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
             model_config=self.model_config,
             scheduler_config=engine_config.scheduler_config,
             parallel_config=engine_config.parallel_config,
-            enable_lora=bool(engine_config.lora_config),
-        )
+            lora_config=engine_config.lora_config)
         self.input_preprocessor = InputPreprocessor(self.model_config,
                                                     self.tokenizer)
 
@@ -414,7 +414,8 @@ def errored(self) -> bool:
     def dead_error(self) -> BaseException:
         return ENGINE_DEAD_ERROR(self._errored_with)
 
-    @overload  # DEPRECATED
+    @overload
+    @deprecated("'inputs' will be renamed to 'prompt")
     def generate(
         self,
         *,
@@ -472,8 +473,8 @@ def generate(
             trace_headers: OpenTelemetry trace headers.
             prompt_adapter_request: Prompt Adapter request to use
                                             for generation, if any.
-            priority: Priority of the request (lower means earlier handling). 
-                Any priority other than 0 will lead to an error if the 
+            priority: Priority of the request (lower means earlier handling).
+                Any priority other than 0 will lead to an error if the
                 scheduling policy is not "priority".
         """
         if inputs is not None:
@@ -485,7 +486,8 @@ def generate(
                                      lora_request, trace_headers,
                                      prompt_adapter_request, priority)
 
-    @overload  # DEPRECATED
+    @overload
+    @deprecated("'inputs' will be renamed to 'prompt")
     def encode(
         self,
         *,
@@ -586,6 +588,7 @@ async def _process_request(
                     default_guided_backend=(self.decoding_config.guided_decoding_backend
                         if self.decoding_config
                         else DecodingConfig.guided_decoding_backend),
+                    model_config=self.model_config
                 )
 
         # 1) Create output queue for this requests.
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 7a6ebb430541f..a9b638ed02a1e 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
     @staticmethod
     @functools.lru_cache
     def _log_prompt_logprob_unsupported_warning_once():
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         logger.warning(
             "Prompt logprob is not supported by multi step workers. "
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index a25c401b4ea10..8de30ccd18a11 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1,11 +1,11 @@
 import itertools
-import json
 import warnings
 from contextlib import contextmanager
 from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type,
                     Union, cast, overload)
 
 from tqdm import tqdm
+from typing_extensions import deprecated
 
 from vllm import envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
@@ -185,12 +185,9 @@ def __init__(
             kwargs["disable_log_stats"] = True
 
         if compilation_config is not None:
-            if isinstance(compilation_config, (int)):
+            if isinstance(compilation_config, (int, dict)):
                 compilation_config_instance = CompilationConfig.from_cli(
                     str(compilation_config))
-            elif isinstance(compilation_config, (dict)):
-                compilation_config_instance = CompilationConfig.from_cli(
-                    json.dumps(compilation_config))
             else:
                 compilation_config_instance = compilation_config
         else:
@@ -256,6 +253,7 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
             tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer)
 
     @overload  # LEGACY: single (prompt + optional token ids)
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def generate(
         self,
         prompts: str,
@@ -268,6 +266,7 @@ def generate(
         ...
 
     @overload  # LEGACY: multi (prompt + optional token ids)
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def generate(
         self,
         prompts: List[str],
@@ -280,6 +279,7 @@ def generate(
         ...
 
     @overload  # LEGACY: single (token ids + optional prompt)
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def generate(
         self,
         prompts: Optional[str] = None,
@@ -293,6 +293,7 @@ def generate(
         ...
 
     @overload  # LEGACY: multi (token ids + optional prompt)
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def generate(
         self,
         prompts: Optional[List[str]] = None,
@@ -306,6 +307,7 @@ def generate(
         ...
 
     @overload  # LEGACY: single or multi token ids [pos-only]
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def generate(
         self,
         prompts: None,
@@ -671,6 +673,7 @@ def chat(
         )
 
     @overload  # LEGACY: single (prompt + optional token ids)
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def encode(
         self,
         prompts: str,
@@ -683,6 +686,7 @@ def encode(
         ...
 
     @overload  # LEGACY: multi (prompt + optional token ids)
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def encode(
         self,
         prompts: List[str],
@@ -695,6 +699,7 @@ def encode(
         ...
 
     @overload  # LEGACY: single (token ids + optional prompt)
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def encode(
         self,
         prompts: Optional[str] = None,
@@ -708,6 +713,7 @@ def encode(
         ...
 
     @overload  # LEGACY: multi (token ids + optional prompt)
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def encode(
         self,
         prompts: Optional[List[str]] = None,
@@ -721,6 +727,7 @@ def encode(
         ...
 
     @overload  # LEGACY: single or multi token ids [pos-only]
+    @deprecated("'prompt_token_ids' will become part of 'prompts")
     def encode(
         self,
         prompts: None,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 6bc31ef83ded4..c7bc30040279c 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -175,8 +175,8 @@ async def build_async_engine_client_from_engine_args(
 
         # Select random path for IPC.
         ipc_path = get_open_zmq_ipc_path()
-        logger.info("Multiprocessing frontend to use %s for IPC Path.",
-                    ipc_path)
+        logger.debug("Multiprocessing frontend to use %s for IPC Path.",
+                     ipc_path)
 
         # Start RPCServer in separate process (holds the LLMEngine).
         # the current process might have CUDA context,
@@ -249,8 +249,8 @@ def mount_metrics(app: FastAPI):
 
     prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None)
     if prometheus_multiproc_dir_path is not None:
-        logger.info("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
-                    prometheus_multiproc_dir_path)
+        logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
+                     prometheus_multiproc_dir_path)
         registry = CollectorRegistry()
         multiprocess.MultiProcessCollector(registry)
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 3e458f9b6f25f..46369e69c35f3 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -8,7 +8,6 @@
     VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()
     VLLM_USE_MODELSCOPE: bool = False
     VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
-    VLLM_INSTANCE_ID: Optional[str] = None
     VLLM_NCCL_SO_PATH: Optional[str] = None
     LD_LIBRARY_PATH: Optional[str] = None
     VLLM_ROCM_PREFER_TORCH: bool = False
@@ -125,7 +124,8 @@ def get_default_config_root():
 
     # If set, vllm will use precompiled binaries (*.so)
     "VLLM_USE_PRECOMPILED":
-    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),
+    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
+        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
 
     # CMake build type
     # If not set, defaults to "Debug" or "RelWithDebInfo"
@@ -186,11 +186,6 @@ def get_default_config_root():
     "VLLM_USE_MODELSCOPE":
     lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",
 
-    # Instance id represents an instance of the VLLM. All processes in the same
-    # instance should have the same instance id.
-    "VLLM_INSTANCE_ID":
-    lambda: os.environ.get("VLLM_INSTANCE_ID", None),
-
     # Interval in seconds to log a warning message when the ring buffer is full
     "VLLM_RINGBUFFER_WARNING_INTERVAL":
     lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 336f9bc8efb20..2816b5c5c1f88 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -10,8 +10,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_open_port,
-                        get_vllm_instance_id, make_async)
+from vllm.utils import get_distributed_init_method, get_open_port, make_async
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -23,7 +22,7 @@ class CPUExecutor(ExecutorBase):
 
     def _init_executor(self) -> None:
         assert self.device_config.device_type == "cpu"
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         assert self.lora_config is None, "cpu backend doesn't support LoRA"
 
@@ -31,9 +30,6 @@ def _init_executor(self) -> None:
         # Environment variables for CPU executor
         #
 
-        # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
-        os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
-
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index a6c05a71d2b6f..c450209f0eb91 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -16,7 +16,7 @@
 from vllm.triton_utils.importing import HAS_TRITON
 from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
                         cuda_is_initialized, get_distributed_init_method,
-                        get_open_port, get_vllm_instance_id, make_async,
+                        get_open_port, make_async,
                         update_environment_variables)
 
 if HAS_TRITON:
@@ -37,9 +37,6 @@ def _init_executor(self) -> None:
         world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
 
-        # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
-        os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
-
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 6542b18ae70b1..4263fb27265f6 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -15,8 +15,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
-                        get_ip, get_open_port, get_vllm_instance_id,
-                        make_async)
+                        get_ip, get_open_port, make_async)
 
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -189,8 +188,14 @@ def sort_by_driver_then_worker_ip(worker):
         self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
 
         # Get the set of GPU IDs used on each node.
-        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
-                                                    use_dummy_driver=True)
+        worker_node_and_gpu_ids = []
+        for worker in [self.driver_dummy_worker] + self.workers:
+            if worker is None:
+                # driver_dummy_worker can be None when using ray spmd worker.
+                continue
+            worker_node_and_gpu_ids.append(
+                ray.get(worker.get_node_and_gpu_ids.remote()) \
+            ) # type: ignore
 
         node_workers = defaultdict(list)  # node id -> list of worker ranks
         node_gpus = defaultdict(list)  # node id -> list of gpu ids
@@ -220,14 +225,10 @@ def sort_by_driver_then_worker_ip(worker):
                 " environment variable, make sure it is unique for"
                 " each node.")
 
-        VLLM_INSTANCE_ID = get_vllm_instance_id()
-
         # Set environment variables for the driver and workers.
         all_args_to_update_environment_variables = [({
             "CUDA_VISIBLE_DEVICES":
             ",".join(map(str, node_gpus[node_id])),
-            "VLLM_INSTANCE_ID":
-            VLLM_INSTANCE_ID,
             "VLLM_TRACE_FUNCTION":
             str(envs.VLLM_TRACE_FUNCTION),
             **({
@@ -334,7 +335,6 @@ def _run_workers(
         async_run_tensor_parallel_workers_only: bool = False,
         all_args: Optional[List[Tuple[Any, ...]]] = None,
         all_kwargs: Optional[List[Dict[str, Any]]] = None,
-        use_dummy_driver: bool = False,
         max_concurrent_workers: Optional[int] = None,
         **kwargs,
     ) -> Any:
@@ -394,18 +394,10 @@ def _run_workers(
             driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
 
             # Start the driver worker after all the ray workers.
-            if not use_dummy_driver:
-                driver_worker_output = [
-                    self.driver_worker.execute_method(method, *driver_args,
-                                                      **driver_kwargs)
-                ]
-            else:
-                assert self.driver_dummy_worker is not None
-                driver_worker_output = [
-                    ray.get(
-                        self.driver_dummy_worker.execute_method.remote(
-                            method, *driver_args, **driver_kwargs))
-                ]
+            driver_worker_output = [
+                self.driver_worker.execute_method(method, *driver_args,
+                                                  **driver_kwargs)
+            ]
 
         # Get the results of the ray workers.
         if self.workers:
diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py
index a74328e5aa272..f3025cb537ab8 100644
--- a/vllm/executor/ray_hpu_executor.py
+++ b/vllm/executor/ray_hpu_executor.py
@@ -15,8 +15,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
-                        get_ip, get_open_port, get_vllm_instance_id,
-                        make_async)
+                        get_ip, get_open_port, make_async)
 
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -164,9 +163,14 @@ def sort_by_driver_then_worker_ip(worker):
         # node will be placed first.
         self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
 
-        # Get the set of GPU IDs used on each node.
-        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
-                                                    use_dummy_driver=True)
+        worker_node_and_gpu_ids = []
+        for worker in [self.driver_dummy_worker] + self.workers:
+            if worker is None:
+                # driver_dummy_worker can be None when using ray spmd worker.
+                continue
+            worker_node_and_gpu_ids.append(
+                ray.get(worker.get_node_and_gpu_ids.remote()) \
+            ) # type: ignore
 
         node_workers = defaultdict(list)  # node id -> list of worker ranks
         node_gpus = defaultdict(list)  # node id -> list of gpu ids
@@ -196,12 +200,8 @@ def sort_by_driver_then_worker_ip(worker):
                 "environment variable, make sure it is unique for"
                 " each node.")
 
-        VLLM_INSTANCE_ID = get_vllm_instance_id()
-
         # Set environment variables for the driver and workers.
         all_args_to_update_environment_variables = [({
-            "VLLM_INSTANCE_ID":
-            VLLM_INSTANCE_ID,
             "VLLM_TRACE_FUNCTION":
             str(envs.VLLM_TRACE_FUNCTION),
         }, ) for (node_id, _) in worker_node_and_gpu_ids]
@@ -301,7 +301,6 @@ def _run_workers(
         async_run_tensor_parallel_workers_only: bool = False,
         all_args: Optional[List[Tuple[Any, ...]]] = None,
         all_kwargs: Optional[List[Dict[str, Any]]] = None,
-        use_dummy_driver: bool = False,
         max_concurrent_workers: Optional[int] = None,
         **kwargs,
     ) -> Any:
@@ -361,18 +360,10 @@ def _run_workers(
             driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
 
             # Start the driver worker after all the ray workers.
-            if not use_dummy_driver:
-                driver_worker_output = [
-                    self.driver_worker.execute_method(method, *driver_args,
-                                                      **driver_kwargs)
-                ]
-            else:
-                assert self.driver_dummy_worker is not None
-                driver_worker_output = [
-                    ray.get(
-                        self.driver_dummy_worker.execute_method.remote(
-                            method, *driver_args, **driver_kwargs))
-                ]
+            driver_worker_output = [
+                self.driver_worker.execute_method(method, *driver_args,
+                                                  **driver_kwargs)
+            ]
 
         # Get the results of the ray workers.
         if self.workers:
diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py
index c227b5e283c68..5118c13934f0d 100644
--- a/vllm/executor/ray_tpu_executor.py
+++ b/vllm/executor/ray_tpu_executor.py
@@ -13,7 +13,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        get_vllm_instance_id, make_async)
+                        make_async)
 
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -137,19 +137,21 @@ def sort_by_driver_then_worker_ip(worker):
         self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
 
         # Get the set of TPU IDs used on each node.
-        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
-                                                    use_dummy_driver=True)
+        worker_node_and_gpu_ids = []
+        for worker in [self.driver_dummy_worker] + self.workers:
+            if worker is None:
+                # driver_dummy_worker can be None when using ray spmd worker.
+                continue
+            worker_node_and_gpu_ids.append(
+                ray.get(worker.get_node_and_gpu_ids.remote()) \
+            ) # type: ignore
 
         node_workers = defaultdict(list)
         for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
             node_workers[node_id].append(i)
 
-        VLLM_INSTANCE_ID = get_vllm_instance_id()
-
         # Set environment variables for the driver and workers.
         all_args_to_update_environment_variables = [({
-            "VLLM_INSTANCE_ID":
-            VLLM_INSTANCE_ID,
             "VLLM_TRACE_FUNCTION":
             str(envs.VLLM_TRACE_FUNCTION),
         }, ) for _ in worker_node_and_gpu_ids]
@@ -203,7 +205,6 @@ def _run_workers(
         async_run_remote_workers_only: bool = False,
         all_args: Optional[List[Tuple[Any, ...]]] = None,
         all_kwargs: Optional[List[Dict[str, Any]]] = None,
-        use_dummy_driver: bool = False,
         max_concurrent_workers: Optional[int] = None,
         use_ray_compiled_dag: bool = False,
         **kwargs,
@@ -245,14 +246,8 @@ def _run_workers(
         driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
 
         # Start the driver worker after all the ray workers.
-        if not use_dummy_driver:
-            driver_worker_output = self.driver_worker.execute_method(
-                method, *driver_args, **driver_kwargs)
-        else:
-            assert self.driver_dummy_worker is not None
-            driver_worker_output = ray.get(
-                self.driver_dummy_worker.execute_method.remote(
-                    method, *driver_args, **driver_kwargs))
+        driver_worker_output = self.driver_worker.execute_method(
+            method, *driver_args, **driver_kwargs)
         # Get the results of the ray workers.
         if self.workers:
             ray_worker_outputs = ray.get(ray_worker_outputs)
diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py
index 2b1cdc09b0a9f..d2086f5fef26c 100644
--- a/vllm/executor/ray_xpu_executor.py
+++ b/vllm/executor/ray_xpu_executor.py
@@ -1,11 +1,13 @@
 import asyncio
 from typing import List, Optional
 
+import ray
+
 import vllm.envs as envs
 from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync
 from vllm.executor.xpu_executor import XPUExecutor
 from vllm.logger import init_logger
-from vllm.utils import get_vllm_instance_id, make_async
+from vllm.utils import make_async
 
 logger = init_logger(__name__)
 
@@ -14,15 +16,16 @@ class RayXPUExecutor(RayGPUExecutor, XPUExecutor):
 
     def _get_env_vars_to_be_updated(self):
         # Get the set of GPU IDs used on each node.
-        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
-                                                    use_dummy_driver=True)
-
-        VLLM_INSTANCE_ID = get_vllm_instance_id()
+        worker_node_and_gpu_ids = []
+        for worker in [self.driver_dummy_worker] + self.workers:
+            if worker is None:
+                # driver_dummy_worker can be None when using ray spmd worker.
+                continue
+            worker_node_and_gpu_ids.append(
+                ray.get(worker.get_node_and_gpu_ids.remote()))  # type: ignore
 
         # Set environment variables for the driver and workers.
         all_args_to_update_environment_variables = [({
-            "VLLM_INSTANCE_ID":
-            VLLM_INSTANCE_ID,
             "VLLM_TRACE_FUNCTION":
             str(envs.VLLM_TRACE_FUNCTION),
         }, ) for (_, _) in worker_node_and_gpu_ids]
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 54fbd7a321a6f..d4402e77a3886 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -38,34 +38,3 @@
     "InputProcessingContext",
     "InputRegistry",
 ]
-
-
-def __getattr__(name: str):
-    import warnings
-
-    if name == "PromptInput":
-        msg = ("PromptInput has been renamed to PromptType. "
-               "The original name will be removed in an upcoming version.")
-
-        warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
-        return PromptType
-
-    if name == "LLMInputs":
-        msg = ("LLMInputs has been renamed to DecoderOnlyInputs. "
-               "The original name will be removed in an upcoming version.")
-
-        warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
-        return DecoderOnlyInputs
-
-    if name == "EncoderDecoderLLMInputs":
-        msg = (
-            "EncoderDecoderLLMInputs has been renamed to EncoderDecoderInputs. "
-            "The original name will be removed in an upcoming version.")
-
-        warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
-        return EncoderDecoderInputs
-
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index fb7dbbebd7b90..85aaaa776907f 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -7,7 +7,8 @@
 from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never
 
 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
+    from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
+                                 MultiModalPlaceholderDict)
     from vllm.multimodal.inputs import MultiModalInputsV2
 
 
@@ -150,6 +151,12 @@ class TokenInputs(TypedDict):
     if the model supports it.
     """
 
+    multi_modal_inputs: NotRequired["MultiModalKwargs"]
+    """
+    Optional multi-modal inputs to pass to the model,
+    if the model supports it.
+    """
+
     multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"]
     """
     Placeholder ranges for the multi-modal data.
@@ -169,6 +176,7 @@ def token_inputs(
     token_type_ids: Optional[List[int]] = None,
     prompt: Optional[str] = None,
     multi_modal_data: Optional["MultiModalDataDict"] = None,
+    multi_modal_inputs: Optional["MultiModalKwargs"] = None,
     multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
     mm_processor_kwargs: Optional[Dict[str, Any]] = None,
 ) -> TokenInputs:
@@ -181,6 +189,8 @@ def token_inputs(
         inputs["token_type_ids"] = token_type_ids
     if multi_modal_data is not None:
         inputs["multi_modal_data"] = multi_modal_data
+    if multi_modal_inputs is not None:
+        inputs["multi_modal_inputs"] = multi_modal_inputs
     if multi_modal_placeholders is not None:
         inputs["multi_modal_placeholders"] = multi_modal_placeholders
     if mm_processor_kwargs is not None:
@@ -273,6 +283,18 @@ def multi_modal_data(self) -> "MultiModalDataDict":
 
         assert_never(inputs)
 
+    @cached_property
+    def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]:
+        inputs = self.inputs
+
+        if inputs["type"] == "token":
+            return inputs.get("multi_modal_inputs", {})
+
+        if inputs["type"] == "multimodal":
+            return inputs.get("mm_kwargs", {})
+
+        assert_never(inputs)
+
     @cached_property
     def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict":
         inputs = self.inputs
@@ -358,34 +380,3 @@ def to_enc_dec_tuple_list(
     return [(enc_dec_prompt["encoder_prompt"],
              enc_dec_prompt["decoder_prompt"])
             for enc_dec_prompt in enc_dec_prompts]
-
-
-def __getattr__(name: str):
-    import warnings
-
-    if name == "PromptInput":
-        msg = ("PromptInput has been renamed to PromptType. "
-               "The original name will be removed in an upcoming version.")
-
-        warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
-        return PromptType
-
-    if name == "LLMInputs":
-        msg = ("LLMInputs has been renamed to DecoderOnlyInputs. "
-               "The original name will be removed in an upcoming version.")
-
-        warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
-        return DecoderOnlyInputs
-
-    if name == "EncoderDecoderLLMInputs":
-        msg = (
-            "EncoderDecoderLLMInputs has been renamed to EncoderDecoderInputs. "
-            "The original name will be removed in an upcoming version.")
-
-        warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
-        return EncoderDecoderInputs
-
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 85ab4355cc2e4..0dfed3b7e61bf 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -69,12 +69,12 @@ class InputProcessingContext(InputContext):
     tokenizer: AnyTokenizer
     """The tokenizer used to tokenize the inputs."""
 
-    def get_hf_processor(self) -> ProcessorMixin:
+    def get_hf_processor(self, **kwargs) -> ProcessorMixin:
         return cached_get_processor(
             self.model_config.tokenizer,
             tokenizer=self.tokenizer,  # Override the tokenizer with ours
             trust_remote_code=self.model_config.trust_remote_code,
-        )
+            **kwargs)
 
 
 N = TypeVar("N", bound=Type[nn.Module])
@@ -232,19 +232,35 @@ def dummy_data_for_profiling(
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
-
-        model_cls, _ = get_model_architecture(model_config)
-        if is_encoder_data:
-            dummy_factory = self._get_dummy_encoder_data_factory(model_cls)
+        from vllm.multimodal import MultiModalKwargs
+        from vllm.multimodal.utils import cached_get_tokenizer
+
+        if mm_registry.has_processor(model_config):
+            tokenizer = cached_get_tokenizer(
+                model_config.tokenizer,
+                trust_remote_code=model_config.trust_remote_code,
+            )
+            processor = mm_registry.create_processor(model_config, tokenizer)
+
+            mm_counts = mm_registry.get_mm_limits_per_prompt(model_config)
+            mm_max_tokens = mm_registry.get_max_tokens_by_modality(
+                model_config)
+
+            dummy_data = processor.get_dummy_data(seq_len, mm_counts,
+                                                  mm_max_tokens)
         else:
-            dummy_factory = self._get_dummy_data_factory(model_cls)
-        mm_counts = mm_registry.get_mm_limits_per_prompt(model_config)
-        mm_processor_kwargs = get_allowed_kwarg_only_overrides(
-            dummy_factory, overrides=model_config.mm_processor_kwargs)
+            model_cls, _ = get_model_architecture(model_config)
+            if is_encoder_data:
+                dummy_factory = self._get_dummy_encoder_data_factory(model_cls)
+            else:
+                dummy_factory = self._get_dummy_data_factory(model_cls)
+            mm_counts = mm_registry.get_mm_limits_per_prompt(model_config)
+            mm_processor_kwargs = get_allowed_kwarg_only_overrides(
+                dummy_factory, overrides=model_config.mm_processor_kwargs)
 
-        dummy_data = dummy_factory(InputContext(model_config), seq_len,
-                                   _MultiModalCounts(mm_counts),
-                                   **mm_processor_kwargs)
+            dummy_data = dummy_factory(InputContext(model_config), seq_len,
+                                       _MultiModalCounts(mm_counts),
+                                       **mm_processor_kwargs)
 
         # Having more tokens is over-conservative but otherwise fine
         num_tokens = dummy_data.seq_data.prompt_token_ids
@@ -257,7 +273,9 @@ def dummy_data_for_profiling(
                 raise AssertionError(
                     f"Expected at least {seq_len} dummy tokens for profiling, "
                     f"but found {len(num_tokens)} tokens instead.")
-        if dummy_data.multi_modal_data is not None:
+
+        if (dummy_data.multi_modal_data is not None and
+                not isinstance(dummy_data.multi_modal_data, MultiModalKwargs)):
             for k, v in dummy_data.multi_modal_data.items():
                 num_items = len(v) if isinstance(v, list) else 1
                 num_expected = mm_counts[k]
diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py
index f5c2eced9d2bb..545ec21ca74c1 100644
--- a/vllm/lora/fully_sharded_layers.py
+++ b/vllm/lora/fully_sharded_layers.py
@@ -1,5 +1,5 @@
 # pylint: disable=unused-argument
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast
 
 import torch
 import torch.nn as nn
@@ -32,6 +32,44 @@ def dec(*args, **kwargs):
     return dec
 
 
+def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA):
+    """ 
+    For `ColumnParallelLinearWithLoRA` or classes that inherit from 
+    `ColumnParallelLinearWithLoRA`, they share the same `apply` logic.
+    """
+    assert (layer.n_slices == len(layer.lora_a_stacked) == len(
+        layer.lora_b_stacked) == len(layer.output_slices))
+    if layer.lora_bias_stacked is not None:
+        assert layer.n_slices == len(layer.lora_bias_stacked)
+
+    output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias)
+
+    x = x.view(-1, x.shape[-1])
+    output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
+
+    # Since communication is needed, the buffer is directly initialized as a
+    # tensor rather than a tuple of tensor.
+    buffers = torch.zeros(
+        (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]),
+        dtype=torch.float32,
+        device=x.device,
+    )
+
+    layer.punica_wrapper.add_shrink(buffers, x, layer.lora_a_stacked, 1.0)
+    buffers = tensor_model_parallel_all_gather(buffers)
+    layer.punica_wrapper.add_expand(output,
+                                    buffers,
+                                    layer.lora_b_stacked,
+                                    layer.lora_bias_stacked,
+                                    layer.output_slices,
+                                    offset_start=0,
+                                    add_input=True)
+
+    output = output.view(*out_orig_shape)
+    # now have column partitioned and packed output
+    return output
+
+
 # these layers are based on the tensor parallelism strategy given in
 # Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
 # https://arxiv.org/abs/2311.03285.
@@ -51,40 +89,15 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
     # gather operation.
     def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
         tp_rank = get_tensor_model_parallel_rank()
-        shard_size = self.lora_a_stacked.shape[2]
+        shard_size = self.lora_a_stacked[0].shape[2]
         start_idx = tp_rank * shard_size
         lora_a = lora_a[:, start_idx:start_idx + shard_size]
         return lora_a
 
-    def apply(self, x: torch.Tensor,
-              bias: Optional[torch.Tensor]) -> torch.Tensor:
-        output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
-
-        x = x.view(-1, x.shape[-1])
-        output, out_orig_shape = output.view(-1,
-                                             output.shape[-1]), output.shape
-        buffer = torch.zeros(
-            (x.shape[0], self.lora_a_stacked.shape[2]),
-            dtype=torch.float32,
-            device=x.device,
-        )
-        self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0)
-        buffer = tensor_model_parallel_all_gather(buffer)
-        self.punica_wrapper.add_expand(output,
-                                       buffer,
-                                       self.lora_b_stacked,
-                                       add_input=True)
-        # now have column partitioned output
-
-        if self.bias_stacked is not None:
-            self.bias_stacked = self.bias_stacked.view(
-                -1, self.bias_stacked.shape[-1])
-            self.bias_stacked = self.bias_stacked[
-                self.punica_wrapper.token_lora_indices]
-            output += self.bias_stacked
-
-        output = output.view(*out_orig_shape)
-        return output
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return _mcp_apply(x, bias, self)
 
     @classmethod
     @_fully_sharded_can_replace
@@ -105,59 +118,6 @@ def can_replace_layer(
         )
 
 
-def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora):
-    """
-    MergedColumnParallelLinearWithShardedLoRA and
-    MergedQKVParallelLinearWithShardedLora share the same
-    LoRa weight application method.
-    
-    The main difference is the step by shard_size for lora_b which can
-    vary for MergedQKVParallelLinearWithShardedLora but is constant for
-    MergedColumnParallelLinearWithShardedLoRA.
-    """
-    # expecting 2 for column parallel and 3 for qkv
-    n = len(layer.lora_a_stacked)
-    output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias)
-
-    x = x.view(-1, x.shape[-1])
-    output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
-    buffers = torch.zeros(
-        (n, x.shape[0], layer.lora_a_stacked[0].shape[2]),
-        dtype=torch.float32,
-        device=x.device,
-    )
-    for idx in range(n):
-        layer.punica_wrapper.add_shrink(buffers[idx], x,
-                                        layer.lora_a_stacked[idx], 1.0)
-
-    buffers = tensor_model_parallel_all_gather(buffers)
-    left_offset = 0
-    for idx in range(n):
-        shard_size = layer.lora_b_stacked[idx].shape[2]
-
-        if layer.bias_stacked is not None:
-            bias = layer.bias_stacked[idx]
-            if bias is not None:
-                bias = bias.view(-1, bias.shape[-1])
-                bias = bias[layer.punica_wrapper.token_lora_indices]
-                bias[layer.punica_wrapper.token_lora_indices == -1] = 0
-                output[:, left_offset:left_offset + shard_size] += bias
-
-        layer.punica_wrapper.add_expand_slice(
-            output,
-            buffers[idx],
-            layer.lora_b_stacked[idx],
-            left_offset,
-            shard_size,
-            add_input=True,
-        )
-        left_offset += shard_size
-
-    output = output.view(*out_orig_shape)
-    # now have column partitioned and packed output
-    return output
-
-
 class MergedColumnParallelLinearWithShardedLoRA(
         MergedColumnParallelLinearWithLoRA):
     """
@@ -181,8 +141,9 @@ def slice_lora_a(
         ]
         return lora_a
 
-    def apply(self, x: torch.Tensor,
-              bias: Optional[torch.Tensor]) -> torch.Tensor:
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
@@ -214,30 +175,15 @@ class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora):
 
     def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
         tp_rank = get_tensor_model_parallel_rank()
-        shard_size = self.lora_a_stacked.shape[2]
+        shard_size = self.lora_a_stacked[0].shape[2]
         start_idx = tp_rank * shard_size
         lora_a = lora_a[:, start_idx:start_idx + shard_size]
         return lora_a
 
-    def apply(self, x: torch.Tensor,
-              bias: Optional[torch.Tensor]) -> torch.Tensor:
-        output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
-
-        x = x.view(-1, x.shape[-1])
-        output, out_orig_shape = output.view(-1,
-                                             output.shape[-1]), output.shape
-        buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]),
-                             dtype=torch.float32,
-                             device=x.device)
-        self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0)
-        buffer = tensor_model_parallel_all_gather(buffer)
-        self.punica_wrapper.add_expand(output,
-                                       buffer,
-                                       self.lora_b_stacked,
-                                       add_input=True)
-        # now have column partitioned output
-        output = output.view(*out_orig_shape)
-        return output
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return _mcp_apply(x, bias, self)
 
     @classmethod
     @_fully_sharded_can_replace
@@ -278,8 +224,9 @@ def slice_lora_a(
         ]
         return lora_a
 
-    def apply(self, x: torch.Tensor,
-              bias: Optional[torch.Tensor]) -> torch.Tensor:
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
@@ -312,7 +259,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
     """
 
     def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
-        shard_size = self.lora_b_stacked.shape[2]
+        shard_size = self.lora_b_stacked[0].shape[2]
         start_idx = self.tp_rank * shard_size
         end_idx = (self.tp_rank + 1) * shard_size
         lora_b = lora_b[:, start_idx:end_idx]
@@ -321,20 +268,24 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
     def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         if bias is None:
             return bias
-        shard_size = self.bias_stacked.shape[2]
+        self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...],
+                                      self.lora_bias_stacked)
+        shard_size = self.lora_bias_stacked[0].shape[2]
         start_idx = self.tp_rank * shard_size
         end_idx = (self.tp_rank + 1) * shard_size
         bias = bias[start_idx:end_idx]
         return bias
 
-    def apply(self, x: torch.Tensor) -> torch.Tensor:
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x)
 
         x = x.view(-1, x.shape[-1])
         output, out_orig_shape = output.view(-1,
                                              output.shape[-1]), output.shape
         buffer = torch.zeros(
-            (x.shape[0], self.lora_a_stacked.shape[2]),
+            (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
             dtype=torch.float32,
             device=x.device,
         )
@@ -348,18 +299,18 @@ def apply(self, x: torch.Tensor) -> torch.Tensor:
         # remains is a standard all_reduce. User should be aware though that
         # the output is not the same as a normal row_parallel, it should be
         # reduced before being used
-        shard_size = self.lora_b_stacked.shape[2]
-        start_idx = self.tp_rank * shard_size
-
-        if self.bias_stacked is not None:
-            bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1])
-            bias = bias[self.punica_wrapper.token_lora_indices]
-            bias[self.punica_wrapper.token_lora_indices == -1] = 0
-            output += bias
-
-        self.punica_wrapper.add_expand_slice(output, buffer,
-                                             self.lora_b_stacked, start_idx,
-                                             shard_size)
+        # NOTE offset are based on the rank.
+        shard_size = self.lora_b_stacked[0].shape[2]
+        offset_start = self.tp_rank * shard_size
+        self.punica_wrapper.add_expand(
+            output,
+            buffer,
+            self.lora_b_stacked,
+            self.lora_bias_stacked,
+            self.output_slices,
+            offset_start=offset_start,
+            add_input=True,
+        )
         output = output.view(*out_orig_shape)
         return output
 
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 3701988ff692f..38cb846578d5c 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1,7 +1,7 @@
 # pylint: disable=unused-argument
 import math
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union, cast
 
 import torch
 import torch.nn as nn
@@ -17,12 +17,14 @@
                               tensor_model_parallel_all_reduce,
                               tensor_model_parallel_gather)
 from vllm.distributed.utils import divide
-from vllm.lora.punica import PunicaWrapper
+# yapf: disable
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               LinearBase,
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                ReplicatedLinear,
                                                RowParallelLinear)
+# yapf: enable
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.rotary_embedding import (
     LinearScalingRotaryEmbedding, RotaryEmbedding)
@@ -30,7 +32,7 @@
     VocabParallelEmbedding)
 
 if TYPE_CHECKING:
-    pass
+    from vllm.lora.punica_wrapper import PunicaWrapperBase
 
 
 def _get_lora_device(base_layer: nn.Module) -> torch.device:
@@ -67,63 +69,6 @@ def dec(*args, **kwargs):
     return dec
 
 
-def apply_bias(
-    indices: torch.Tensor,
-    output: torch.Tensor,
-    bias_stacked: torch.Tensor,
-):
-    """Applies bias to output
-
-    Input shapes:
-        bias_stacked:    (num_loras, output_dim)
-        indices:         (batch_size)
-        output:          (batch_size, output_dim)
-    """
-    org_output = output
-    output = output.view(-1, output.shape[-1])
-    indices = indices.view(-1)
-
-    bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1])
-    bias_stacked = bias_stacked[indices]
-    bias_stacked[indices == -1] = 0
-    output += bias_stacked
-
-    return output.view_as(org_output)
-
-
-def apply_bias_packed_nslice(
-    indices: torch.Tensor,
-    output: torch.Tensor,
-    output_slices: Tuple[int, ...],
-    bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
-):
-    """Applies bias to output
-
-    Input shapes:
-        bias_stacked:      3 element tuple of (num_loras, output_dim)
-        indices:           (batch_size)
-        output:            (batch_size, q_slice_size + 2*kv_slice_size)
-        output_slices:     n-1 element tuple of (slice_size...),
-                           where n is number of slices
-    """
-    org_output = output
-    output = output.view(-1, output.shape[-1])
-    indices = indices.view(-1)
-
-    offset_left = 0
-    for slice_idx, slice in enumerate(output_slices):
-        bias = bias_stacked[slice_idx]
-        if bias is not None:
-            bias = bias.view(-1, bias.shape[-1])
-            bias = bias[indices]
-            bias[indices == -1] = 0
-            output[:, offset_left:offset_left + slice] += bias
-
-        offset_left += slice
-
-    return output.view_as(org_output)
-
-
 @dataclass
 class LoRAMapping(AdapterMapping):
     is_prefill: bool = False
@@ -169,9 +114,9 @@ def set_lora(
 
     def set_mapping(
         self,
-        punica_wrapper: PunicaWrapper,
+        punica_wrapper,
     ):
-        self.punica_wrapper: PunicaWrapper = punica_wrapper
+        self.punica_wrapper: PunicaWrapperBase = punica_wrapper
 
     @classmethod
     def can_replace_layer(
@@ -306,12 +251,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 full_lora_a_embeddings.shape[1],
                 -1,
             )
-
-        # Embedding layer only need expand op
-        self.punica_wrapper.add_expand(full_output,
-                                       full_lora_a_embeddings,
-                                       self.lora_b_stacked,
-                                       add_input=True)
+        self.punica_wrapper.add_lora_embedding(full_output,
+                                               full_lora_a_embeddings,
+                                               self.lora_b_stacked,
+                                               add_input=True)
         return full_output.view_as(full_output_org)
 
     @classmethod
@@ -325,14 +268,19 @@ def can_replace_layer(
         return type(source_layer) is VocabParallelEmbedding
 
 
-class ReplicatedLinearWithLoRA(BaseLayerWithLoRA):
+class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
 
-    def __init__(self, base_layer: ReplicatedLinear) -> None:
+    def __init__(self, base_layer: LinearBase):
         super().__init__()
         self.base_layer = base_layer
         self.input_size = self.base_layer.input_size
-        self.output_size = self.base_layer.output_size
         self.device = _get_lora_device(self.base_layer)
+        self.lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None
+
+        self.output_slices: Tuple[int, ...]
+        self.tp_size: int
+        self.output_size: int
+        self.n_slices: int
 
     def create_lora_weights(
         self,
@@ -341,39 +289,64 @@ def create_lora_weights(
         model_config: Optional[PretrainedConfig] = None,
     ) -> None:
         self.lora_config = lora_config
-        lora_a_output_size = lora_config.max_lora_rank
-        self.lora_a_stacked = torch.zeros(
-            max_loras,
-            1,
-            lora_a_output_size,
-            self.input_size,
-            dtype=lora_config.lora_dtype,
-            device=self.device,
-        )
-        self.lora_b_stacked = torch.zeros(
-            max_loras,
-            1,
-            self.output_size,
-            lora_config.max_lora_rank,
-            dtype=lora_config.lora_dtype,
-            device=self.device,
-        )
-        if lora_config.bias_enabled:
-            self.bias_stacked = torch.zeros(
+        #
+        if isinstance(self.base_layer, ReplicatedLinear):
+            lora_a_out_size = lora_config.max_lora_rank
+            lora_b_out_size = self.output_size
+
+        elif isinstance(self.base_layer, ColumnParallelLinear):
+            lora_a_out_size = (lora_config.max_lora_rank if
+                               not lora_config.fully_sharded_loras else divide(
+                                   lora_config.max_lora_rank, self.tp_size))
+            lora_b_out_size = self.output_size
+
+        elif isinstance(self.base_layer, RowParallelLinear):
+            lora_a_out_size = lora_config.max_lora_rank
+            lora_b_out_size = (self.output_size if
+                               not lora_config.fully_sharded_loras else divide(
+                                   self.output_size, self.tp_size))
+        else:
+            raise NotImplementedError
+
+        self.lora_a_stacked = tuple(
+            torch.zeros(
                 max_loras,
                 1,
-                self.output_size,
+                lora_a_out_size,
+                self.input_size,
                 dtype=lora_config.lora_dtype,
                 device=self.device,
-            )
-        else:
-            self.bias_stacked = None
+            ) for _ in range(self.n_slices))
+        self.lora_b_stacked = tuple(
+            torch.zeros(
+                max_loras,
+                1,
+                lora_b_out_size,
+                lora_config.max_lora_rank,
+                dtype=lora_config.lora_dtype,
+                device=self.device,
+            ) for _ in range(self.n_slices))
+        if lora_config.bias_enabled:
+            lora_bias_out_size = lora_b_out_size
+            self.lora_bias_stacked = tuple(
+                torch.zeros(
+                    max_loras,
+                    1,
+                    lora_bias_out_size,
+                    dtype=lora_config.lora_dtype,
+                    device=self.device,
+                ) for _ in range(self.n_slices))
+        self.output_slices = (self.lora_b_stacked[0].shape[2], )
 
     def reset_lora(self, index: int):
-        self.lora_a_stacked[index] = 0
-        self.lora_b_stacked[index] = 0
-        if self.lora_config.bias_enabled:
-            self.bias_stacked[index] = 0
+        for s_index in range(self.n_slices):
+            self.lora_a_stacked[s_index][index] = 0
+            self.lora_b_stacked[s_index][index] = 0
+            if self.lora_config.bias_enabled:
+                # Make mypy happy
+                self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...],
+                                              self.lora_bias_stacked)
+                self.lora_bias_stacked[s_index][index] = 0
 
     def set_lora(
         self,
@@ -381,35 +354,56 @@ def set_lora(
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
         embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
+        lora_bias: Optional[torch.Tensor] = None,
     ):
-        self.reset_lora(index)
+        # Except for QKVParallelLinearWithLora and
+        # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
+        # store weights in a tuple of size 1. These two layers will
+        # override this function.
+        assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) ==
+                self.n_slices == 1)
 
-        self.lora_a_stacked[index,
-                            0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
-                                lora_a.T, non_blocking=True)
-        self.lora_b_stacked[index,
-                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
-                                lora_b.T, non_blocking=True)
-        if bias is not None:
-            self.bias_stacked[index,
-                              0, :bias.shape[0]].copy_(bias.T,
-                                                       non_blocking=True)
-
-    def apply(self, x: torch.Tensor,
-              bias: Optional[torch.Tensor]) -> torch.Tensor:
+        self.reset_lora(index)
+        if self.tp_size > 1:
+            lora_a = self.slice_lora_a(lora_a)
+            lora_b = self.slice_lora_b(lora_b)
+            if lora_bias is not None:
+                lora_bias = self.slice_bias(lora_bias)
+
+        self.lora_a_stacked[0][index,
+                               0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
+                                   lora_a.T, non_blocking=True)
+        self.lora_b_stacked[0][index,
+                               0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
+                                   lora_b.T, non_blocking=True)
+        if lora_bias is not None:
+
+            self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...],
+                                          self.lora_bias_stacked)
+            assert len(self.lora_bias_stacked)
+            self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_(
+                lora_bias.T, non_blocking=True)
+
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
-        if self.bias_stacked is not None:
-            self.indices = self.punica_wrapper.token_lora_indices
-            output = apply_bias(
-                self.indices,
-                output,
-                self.bias_stacked,
-            )
-        self.punica_wrapper.add_lora(output, x, self.lora_a_stacked,
-                                     self.lora_b_stacked, 1.0)
+        self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked,
+                                            self.lora_b_stacked,
+                                            self.lora_bias_stacked, 1.0,
+                                            self.output_slices)
         return output
 
+
+class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
+
+    def __init__(self, base_layer: ReplicatedLinear) -> None:
+        super().__init__(base_layer, )
+        # To ensure interface compatibility, set to 1 always.
+        self.tp_size = 1
+        self.output_size = self.base_layer.output_size
+        self.n_slices = 1
+
     def forward(self, input_):
         """Forward of ReplicatedLinearWithLoRA
 
@@ -442,73 +436,26 @@ def can_replace_layer(
         return type(source_layer) is ReplicatedLinear
 
 
-class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA):
+class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
     """
     LoRA on top of ColumnParallelLinear layer.
-
     LoRA B is sliced for tensor parallelism.
+    There are two types for the `base_layer`:
+    1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`.
+    2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`.
     """
 
     def __init__(self, base_layer: ColumnParallelLinear) -> None:
-        super().__init__()
+        super().__init__(base_layer)
         # The base_layer type is ColumnParallelLinear or
         # MergedColumnParallelLinear, their weight sharding logic is
         # inconsistent when TP is greater than 1.
         self.is_merged_col_linear = type(
             base_layer) is MergedColumnParallelLinear
-
-        self.base_layer = base_layer
         self.tp_size = get_tensor_model_parallel_world_size()
-        self.input_size = self.base_layer.input_size
         self.output_size = self.base_layer.output_size_per_partition
-        self.device = _get_lora_device(self.base_layer)
-
-    def create_lora_weights(
-        self,
-        max_loras: int,
-        lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
-    ) -> None:
-        self.lora_config = lora_config
-        self.tp_size = get_tensor_model_parallel_world_size()
-        lora_a_output_size_per_partition = (
-            lora_config.max_lora_rank if not lora_config.fully_sharded_loras
-            else divide(lora_config.max_lora_rank, self.tp_size))
-        self.lora_a_stacked = torch.zeros(
-            max_loras,
-            1,
-            lora_a_output_size_per_partition,
-            self.input_size,
-            dtype=lora_config.lora_dtype,
-            device=self.device,
-        )
-        self.lora_b_stacked = torch.zeros(
-            max_loras,
-            1,
-            self.output_size,
-            lora_config.max_lora_rank,
-            dtype=lora_config.lora_dtype,
-            device=self.device,
-        )
-
-        if lora_config.bias_enabled:
-            self.bias_stacked = torch.zeros(
-                max_loras,
-                1,
-                self.output_size,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            )
-        else:
-            self.bias_stacked = None
-
-        self.output_dim = self.lora_b_stacked.shape[2]
-
-    def reset_lora(self, index: int):
-        self.lora_a_stacked[index] = 0
-        self.lora_b_stacked[index] = 0
-        if self.lora_config.bias_enabled:
-            self.bias_stacked[index] = 0
+        # There is only one LoRA layer
+        self.n_slices = 1
 
     def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
         return lora_a
@@ -547,46 +494,6 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         bias = bias[start_idx:end_idx]
         return bias
 
-    def set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
-    ):
-        self.reset_lora(index)
-
-        if self.tp_size > 1:
-            lora_a = self.slice_lora_a(lora_a)
-            lora_b = self.slice_lora_b(lora_b)
-            bias = self.slice_bias(bias)
-
-        self.lora_a_stacked[index,
-                            0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
-                                lora_a.T, non_blocking=True)
-        self.lora_b_stacked[index,
-                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
-                                lora_b.T, non_blocking=True)
-        if bias is not None:
-            self.bias_stacked[index,
-                              0, :bias.shape[0]].copy_(bias.T,
-                                                       non_blocking=True)
-
-    def apply(self, x: torch.Tensor,
-              bias: Optional[torch.Tensor]) -> torch.Tensor:
-        output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
-        if self.bias_stacked is not None:
-            self.indices = self.punica_wrapper.token_lora_indices
-            output = apply_bias(
-                self.indices,
-                output,
-                self.bias_stacked,
-            )
-        self.punica_wrapper.add_lora(output, x, self.lora_a_stacked,
-                                     self.lora_b_stacked, 1.0)
-        return output
-
     def forward(self, input_):
         """Forward of ColumnParallelLinear
 
@@ -634,8 +541,20 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     Both slices must have the same size.
     """
 
-    def __init__(self, base_layer: MergedColumnParallelLinear) -> None:
+    def __init__(
+        self, base_layer: Union[MergedColumnParallelLinear,
+                                QKVParallelLinear]) -> None:
         super().__init__(base_layer)
+        # There are two LoRA layers
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        # the output_sizes in MergedColumnParallelLinear is not sharded by tp
+        # we need to divide it by the tp_size to get correct slices size
+        output_sizes = self.base_layer.output_sizes
+        self.output_slices = tuple(
+            divide(output_size, self.tp_size) for output_size in output_sizes)
+        self.n_slices = len(self.output_slices)
+        self.output_ids = (self.tp_rank, ) * self.n_slices
 
     def create_lora_weights(
         self,
@@ -643,16 +562,11 @@ def create_lora_weights(
         lora_config: LoRAConfig,
         model_config: Optional[PretrainedConfig] = None,
     ) -> None:
+        """
+        The main reason for overriding this function is to enhance  code 
+        maintainability.
+        """
         self.lora_config = lora_config
-        n_slices = 2
-        if not (len(self.base_layer.output_sizes) == n_slices
-                and self.base_layer.output_sizes[0]
-                == self.base_layer.output_sizes[1]):
-            raise ValueError(
-                "LoRAColumnParallelLinear2Slice requires 2 slices with "
-                "the same size.")
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.tp_rank = get_tensor_model_parallel_rank()
 
         lora_a_output_size_per_partition = (
             lora_config.max_lora_rank if not lora_config.fully_sharded_loras
@@ -666,38 +580,25 @@ def create_lora_weights(
                 self.input_size,
                 dtype=lora_config.lora_dtype,
                 device=self.device,
-            ) for _ in range(n_slices))
+            ) for _ in range(self.n_slices))
         self.lora_b_stacked = tuple(
             torch.zeros(
                 max_loras,
                 1,
-                self.output_size // 2,
+                output_size,
                 lora_config.max_lora_rank,
                 dtype=lora_config.lora_dtype,
                 device=self.device,
-            ) for _ in range(n_slices))
+            ) for output_size in self.output_slices)
         if lora_config.bias_enabled:
-            self.bias_stacked = tuple(
+            self.lora_bias_stacked = tuple(
                 torch.zeros(
                     max_loras,
                     1,
-                    self.output_size // 2,
+                    output_size,
                     dtype=lora_config.lora_dtype,
                     device=self.device,
-                ) for _ in range(n_slices))
-        else:
-            self.bias_stacked = None
-
-        self.output_dim = self.lora_b_stacked[0].shape[2]
-
-    def reset_lora(self, index: int):
-        self.lora_a_stacked[0][index] = 0
-        self.lora_a_stacked[1][index] = 0
-        self.lora_b_stacked[0][index] = 0
-        self.lora_b_stacked[1][index] = 0
-        if self.lora_config.bias_enabled:
-            self.bias_stacked[0][index] = 0
-            self.bias_stacked[1][index] = 0
+                ) for output_size in self.output_slices)
 
     def slice_lora_a(
         self, lora_a: List[Union[torch.Tensor, None]]
@@ -707,27 +608,21 @@ def slice_lora_a(
     def slice_lora_b(
         self, lora_b: List[Union[torch.Tensor, None]]
     ) -> List[Union[torch.Tensor, None]]:
-        #NOTE: lora_b contains 2 subloras, and each sublora could be None.
-        shard_size = self.output_dim
-        start_idx = self.tp_rank * shard_size
-        end_idx = (self.tp_rank + 1) * shard_size
-        lora_b = [
-            lora_b[0][:, start_idx:end_idx] if lora_b[0] is not None else None,
-            lora_b[1][:, start_idx:end_idx] if lora_b[1] is not None else None,
-        ]
+        for i, (shard_id, shard_size) in enumerate(
+                zip(self.output_ids, self.output_slices)):
+            if (lora_b_i := lora_b[i]) is not None:
+                lora_b[i] = lora_b_i[:, shard_size * shard_id:shard_size *
+                                     (shard_id + 1)]
         return lora_b
 
     def slice_bias(
         self, bias: List[Union[torch.Tensor,
                                None]]) -> List[Union[torch.Tensor, None]]:
-        # NOTE : each bias could be None.
-        shard_size = self.output_dim
-        start_idx = self.tp_rank * shard_size
-        end_idx = (self.tp_rank + 1) * shard_size
-        bias = [
-            bias[0][start_idx:end_idx] if bias[0] is not None else None,
-            bias[1][start_idx:end_idx] if bias[1] is not None else None
-        ]
+        for i, (shard_id, shard_size) in enumerate(
+                zip(self.output_ids, self.output_slices)):
+            if (bias_i := bias[i]) is not None:
+                bias[i] = bias_i[shard_size * shard_id:shard_size *
+                                 (shard_id + 1)]
         return bias
 
     def set_lora(
@@ -736,54 +631,35 @@ def set_lora(
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
         embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
+        lora_bias: Optional[torch.Tensor] = None,
     ):
         self.reset_lora(index)
 
         if self.tp_size > 1:
             lora_a = self.slice_lora_a(lora_a)
             lora_b = self.slice_lora_b(lora_b)
-            if bias is not None:
-                bias = self.slice_bias(bias)
-
-        if lora_a[0] is not None:
-            self.lora_a_stacked[0][
-                index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_(
-                    lora_a[0].T, non_blocking=True)
-            self.lora_b_stacked[0][
-                index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_(
-                    lora_b[0].T, non_blocking=True)
-        if bias is not None and bias[0] is not None:
-            self.bias_stacked[0][index,
-                                 0, :bias[0].shape[0]].copy_(bias[0].T,
-                                                             non_blocking=True)
-        if lora_a[1] is not None:
-            self.lora_a_stacked[1][
-                index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_(
-                    lora_a[1].T, non_blocking=True)
-            self.lora_b_stacked[1][
-                index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_(
-                    lora_b[1].T, non_blocking=True)
-        if bias is not None and bias[1] is not None:
-            self.bias_stacked[1][index,
-                                 0, :bias[1].shape[0]].copy_(bias[1].T,
-                                                             non_blocking=True)
-
-    def apply(self, x: torch.Tensor,
-              bias: Optional[torch.Tensor]) -> torch.Tensor:
-        output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
-        if self.bias_stacked is not None:
-            self.indices = self.punica_wrapper.token_lora_indices
-            output = apply_bias_packed_nslice(
-                self.indices,
-                output,
-                (self.output_dim, self.output_dim),
-                self.bias_stacked,
-            )
-        self.punica_wrapper.add_lora_packed_nslice(
-            output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0,
-            (self.output_dim, self.output_dim))
-        return output
+            if lora_bias is not None:
+                lora_bias = self.slice_bias(lora_bias)
+
+        for i in range(self.n_slices):
+            if (lora_a_i := lora_a[i]) is not None:
+                self.lora_a_stacked[i][
+                    index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_(
+                        lora_a_i.T, non_blocking=True)
+            if (lora_b_i := lora_b[i]) is not None:
+                self.lora_b_stacked[i][
+                    index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_(
+                        lora_b_i.T, non_blocking=True)
+
+        if lora_bias is not None:
+            self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...],
+                                          self.lora_bias_stacked)
+            for i in range(self.n_slices):
+                if (lora_bias_i := lora_bias[i]) is not None:
+                    self.lora_bias_stacked[i][index,
+                                              0, :lora_bias_i.shape[0]].copy_(
+                                                  lora_bias_i.T,
+                                                  non_blocking=True)
 
     @classmethod
     @_not_fully_sharded_can_replace
@@ -813,7 +689,6 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
 
     def __init__(self, base_layer: QKVParallelLinear) -> None:
         super().__init__(base_layer)
-        self.tp_size = get_tensor_model_parallel_world_size()
         self.q_proj_total_size = (self.base_layer.total_num_heads *
                                   self.base_layer.head_size)
         self.q_proj_shard_size = (self.base_layer.num_heads *
@@ -822,6 +697,8 @@ def __init__(self, base_layer: QKVParallelLinear) -> None:
                                    self.base_layer.head_size)
         self.kv_proj_total_size = (self.base_layer.total_num_kv_heads *
                                    self.base_layer.head_size)
+        # There is only one LoRA layer
+        self.n_slices = 1
 
     def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
         tp_rank = get_tensor_model_parallel_rank()
@@ -856,32 +733,6 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         bias = torch.cat([bias_q, bias_k, bias_v], dim=1)
         return bias
 
-    def set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
-    ):
-        self.reset_lora(index)
-        if self.tp_size > 1:
-            lora_a = self.slice_lora_a(lora_a)
-            lora_b = self.slice_lora_b(lora_b)
-            if bias is not None:
-                bias = self.slice_bias(bias)
-
-        self.lora_a_stacked[index,
-                            0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
-                                lora_a.T, non_blocking=True)
-        self.lora_b_stacked[index,
-                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
-                                lora_b.T, non_blocking=True)
-        if bias is not None:
-            self.bias_stacked[index,
-                              0, :bias.shape[0]].copy_(bias.T,
-                                                       non_blocking=True)
-
     @classmethod
     @_not_fully_sharded_can_replace
     def can_replace_layer(cls, source_layer: nn.Module,
@@ -891,8 +742,8 @@ def can_replace_layer(cls, source_layer: nn.Module,
             packed_modules_list) == 1
 
 
-class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
-    """ColumnParallelLinear layer that is composed of 3 sublayers (slices)
+class MergedQKVParallelLinearWithLora(MergedColumnParallelLinearWithLoRA):
+    """MergedColumnParallelLinear layer that is composed of 3 sublayers (slices)
     packed together in qkv proj fashion
     (q_proj + k_proj + v_proj -> qkv_proj).
 
@@ -904,16 +755,11 @@ class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
 
     def __init__(self, base_layer: QKVParallelLinear) -> None:
         super().__init__(base_layer)
-
-    def create_lora_weights(
-        self,
-        max_loras: int,
-        lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
-    ) -> None:
-        self.lora_config = lora_config
+        # There are three LoRA layer.
+        self.n_slices = len(self.base_layer.output_sizes)
         self.tp_size = get_tensor_model_parallel_world_size()
         self.tp_rank = get_tensor_model_parallel_rank()
+
         self.q_proj_shard_size = (self.base_layer.num_heads *
                                   self.base_layer.head_size)
         self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
@@ -921,227 +767,28 @@ def create_lora_weights(
         self.q_shard_id = self.tp_rank
         self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas
 
-        lora_a_output_size_per_partition = (
-            lora_config.max_lora_rank if not lora_config.fully_sharded_loras
-            else divide(lora_config.max_lora_rank, self.tp_size))
-        # q, k, v
-        self.lora_a_stacked = (
-            torch.zeros(
-                max_loras,
-                1,
-                lora_a_output_size_per_partition,
-                self.input_size,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ),
-            torch.zeros(
-                max_loras,
-                1,
-                lora_a_output_size_per_partition,
-                self.input_size,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ),
-            torch.zeros(
-                max_loras,
-                1,
-                lora_a_output_size_per_partition,
-                self.input_size,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ),
-        )
-        self.lora_b_stacked = (
-            torch.zeros(
-                max_loras,
-                1,
-                self.q_proj_shard_size,
-                lora_config.max_lora_rank,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ),
-            torch.zeros(
-                max_loras,
-                1,
-                self.kv_proj_shard_size,
-                lora_config.max_lora_rank,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ),
-            torch.zeros(
-                max_loras,
-                1,
-                self.kv_proj_shard_size,
-                lora_config.max_lora_rank,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ),
-        )
-        if lora_config.bias_enabled:
-            self.bias_stacked = (
-                torch.zeros(
-                    max_loras,
-                    1,
-                    self.q_proj_shard_size,
-                    dtype=lora_config.lora_dtype,
-                    device=self.device,
-                ),
-                torch.zeros(
-                    max_loras,
-                    1,
-                    self.kv_proj_shard_size,
-                    dtype=lora_config.lora_dtype,
-                    device=self.device,
-                ),
-                torch.zeros(
-                    max_loras,
-                    1,
-                    self.kv_proj_shard_size,
-                    dtype=lora_config.lora_dtype,
-                    device=self.device,
-                ),
-            )
-        else:
-            self.bias_stacked = None
-
         self.output_slices = (
             self.q_proj_shard_size,
             self.kv_proj_shard_size,
             self.kv_proj_shard_size,
         )
-        self.packed_indices: Optional[torch.Tensor] = None
-        self.standard_indices: Optional[torch.Tensor] = None
-        # lazily initialized.
-        self.indices: torch.Tensor
-        self.indices_len: List[int]
-
-    def reset_lora(self, index: int):
-        self.lora_a_stacked[0][index] = 0
-        self.lora_b_stacked[0][index] = 0
-        self.lora_a_stacked[1][index] = 0
-        self.lora_b_stacked[1][index] = 0
-        self.lora_a_stacked[2][index] = 0
-        self.lora_b_stacked[2][index] = 0
-        if self.lora_config.bias_enabled:
-            self.bias_stacked[0][index] = 0
-            self.bias_stacked[1][index] = 0
-            self.bias_stacked[2][index] = 0
-
-    def slice_lora_a(
-        self, lora_a: List[Union[torch.Tensor, None]]
-    ) -> List[Union[torch.Tensor, None]]:
-        return lora_a
-
-    def slice_lora_b(
-        self, lora_b: List[Union[torch.Tensor, None]]
-    ) -> List[Union[torch.Tensor, None]]:
-        lora_b_q, lora_b_k, lora_b_v = None, None, None
-        if lora_b[0] is not None:
-            lora_b_q = lora_b[0][:, self.q_proj_shard_size *
-                                 self.q_shard_id:self.q_proj_shard_size *
-                                 (self.q_shard_id + 1), ]
-        if lora_b[1] is not None:
-            lora_b_k = lora_b[1][:, self.kv_proj_shard_size *
-                                 self.kv_shard_id:self.kv_proj_shard_size *
-                                 (self.kv_shard_id + 1), ]
-        if lora_b[2] is not None:
-            lora_b_v = lora_b[2][:, self.kv_proj_shard_size *
-                                 self.kv_shard_id:self.kv_proj_shard_size *
-                                 (self.kv_shard_id + 1), ]
-        lora_b = [lora_b_q, lora_b_k, lora_b_v]
-        return lora_b
-
-    def slice_bias(
-        self, bias: List[Union[torch.Tensor,
-                               None]]) -> List[Union[torch.Tensor, None]]:
-        bias_q, bias_k, bias_v = bias
-        if bias_q is not None:
-            bias_q = bias_q[self.q_proj_shard_size *
-                            self.q_shard_id:self.q_proj_shard_size *
-                            (self.q_shard_id + 1)]
-        if bias_k is not None:
-            bias_k = bias_k[self.kv_proj_shard_size *
-                            self.kv_shard_id:self.kv_proj_shard_size *
-                            (self.kv_shard_id + 1)]
-        if bias_v is not None:
-            bias_v = bias_v[self.kv_proj_shard_size *
-                            self.kv_shard_id:self.kv_proj_shard_size *
-                            (self.kv_shard_id + 1)]
-        bias = [bias_q, bias_k, bias_v]
-        return bias
+        self.output_ids = (
+            self.q_shard_id,
+            self.kv_shard_id,
+            self.kv_shard_id,
+        )
 
-    def set_lora(
+    def create_lora_weights(
         self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
-    ):
-        self.reset_lora(index)
-
-        if self.tp_size > 1:
-            lora_a = self.slice_lora_a(lora_a)
-            lora_b = self.slice_lora_b(lora_b)
-            if bias is not None:
-                bias = self.slice_bias(bias)
-
-        if lora_b[0] is not None:
-            lora_b_q = lora_b[0]
-            self.lora_b_stacked[0][
-                index, 0, :lora_b_q.shape[1], :lora_b_q.shape[0]].copy_(
-                    lora_b_q.T, non_blocking=True)
-        if lora_b[1] is not None:
-            lora_b_k = lora_b[1]
-            self.lora_b_stacked[1][
-                index, 0, :lora_b_k.shape[1], :lora_b_k.shape[0]].copy_(
-                    lora_b_k.T, non_blocking=True)
-        if lora_b[2] is not None:
-            lora_b_v = lora_b[2]
-            self.lora_b_stacked[2][
-                index, 0, :lora_b_v.shape[1], :lora_b_v.shape[0]].copy_(
-                    lora_b_v.T, non_blocking=True)
-
-        if lora_a[0] is not None:
-            self.lora_a_stacked[0][
-                index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_(
-                    lora_a[0].T, non_blocking=True)
-        if lora_a[1] is not None:
-            self.lora_a_stacked[1][
-                index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_(
-                    lora_a[1].T, non_blocking=True)
-        if lora_a[2] is not None:
-            self.lora_a_stacked[2][
-                index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_(
-                    lora_a[2].T, non_blocking=True)
-
-        if bias is not None:
-            if bias[0] is not None:
-                self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_(
-                    bias[0].T, non_blocking=True)
-            if bias[1] is not None:
-                self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_(
-                    bias[1].T, non_blocking=True)
-            if bias[2] is not None:
-                self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_(
-                    bias[2].T, non_blocking=True)
-
-    def apply(self, x: torch.Tensor,
-              bias: Optional[torch.Tensor]) -> torch.Tensor:
-        output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
-        if self.bias_stacked is not None:
-            self.indices = self.punica_wrapper.token_lora_indices
-            output = apply_bias_packed_nslice(
-                self.indices,
-                output,
-                self.output_slices,
-                self.bias_stacked,
-            )
-        self.punica_wrapper.add_lora_packed_nslice(output, x,
-                                                   self.lora_a_stacked,
-                                                   self.lora_b_stacked, 1.0,
-                                                   self.output_slices)
-        return output
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        """
+        The main reason for overloading this function is to handle inconsistent 
+        weight dimensions in qkv lora.
+        """
+        super().create_lora_weights(max_loras, lora_config, model_config)
 
     @classmethod
     @_not_fully_sharded_can_replace
@@ -1156,76 +803,25 @@ def can_replace_layer(
                 and len(packed_modules_list) == 3)
 
 
-class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
+class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
 
     def __init__(self, base_layer: RowParallelLinear) -> None:
-        super().__init__()
-        self.base_layer = base_layer
+        super().__init__(base_layer)
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # reset input_size
         self.input_size = self.base_layer.input_size_per_partition
         self.output_size = self.base_layer.output_size
-        self.device = _get_lora_device(self.base_layer)
 
-    def create_lora_weights(
-        self,
-        max_loras: int,
-        lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
-    ) -> None:
-        self.lora_config = lora_config
         self.tp_rank = get_tensor_model_parallel_rank()
-        self.lora_a_stacked = torch.zeros(
-            (
-                max_loras,
-                1,
-                lora_config.max_lora_rank,
-                self.input_size,
-            ),
-            dtype=lora_config.lora_dtype,
-            device=self.device,
-        )
-        tp_size = get_tensor_model_parallel_world_size()
-        lora_b_output_size_per_partition = (
-            self.output_size if not lora_config.fully_sharded_loras else
-            divide(self.output_size, tp_size))
-
-        self.lora_b_stacked = torch.zeros(
-            (
-                max_loras,
-                1,
-                lora_b_output_size_per_partition,
-                lora_config.max_lora_rank,
-            ),
-            dtype=lora_config.lora_dtype,
-            device=self.device,
-        )
-
-        if lora_config.bias_enabled:
-            self.bias_stacked = torch.zeros(
-                (
-                    max_loras,
-                    1,
-                    self.output_size,
-                ),
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            )
-        else:
-            self.bias_stacked = None
-        # Lazily initialized
-        self.indices: torch.Tensor
-        self.indices_len: List[int]
-
-    def reset_lora(self, index: int):
-        self.lora_a_stacked[index] = 0
-        self.lora_b_stacked[index] = 0
-        if self.lora_config.bias_enabled:
-            self.bias_stacked[index] = 0
+        # There is only one LoRA layer.
+        self.n_slices = 1
 
     def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
-        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+
         shard_size = self.input_size
-        start_idx = tensor_model_parallel_rank * shard_size
-        end_idx = (tensor_model_parallel_rank + 1) * shard_size
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
         lora_a = lora_a[start_idx:end_idx, :]
         return lora_a
 
@@ -1235,46 +831,6 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
     def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         return bias
 
-    def set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
-    ):
-        self.reset_lora(index)
-
-        if self.base_layer.tp_size > 1:
-            lora_a = self.slice_lora_a(lora_a)
-            lora_b = self.slice_lora_b(lora_b)
-            if bias is not None:
-                bias = self.slice_bias(bias)
-
-        self.lora_a_stacked[index,
-                            0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
-                                lora_a.T, non_blocking=True)
-        self.lora_b_stacked[index,
-                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
-                                lora_b.T, non_blocking=True)
-        if bias is not None:
-            self.bias_stacked[index,
-                              0, :bias.shape[0]].copy_(bias.T,
-                                                       non_blocking=True)
-
-    def apply(self, x: torch.Tensor) -> torch.Tensor:
-        output = self.base_layer.quant_method.apply(self.base_layer, x)
-        if self.bias_stacked is not None:
-            self.indices = self.punica_wrapper.token_lora_indices
-            output = apply_bias(
-                self.indices,
-                output,
-                self.bias_stacked,
-            )
-        self.punica_wrapper.add_lora(output, x, self.lora_a_stacked,
-                                     self.lora_b_stacked, 1.0)
-        return output
-
     def forward(self, input_):
         """Forward of RowParallelLinear
 
@@ -1292,10 +848,9 @@ def forward(self, input_):
             input_parallel = input_
         else:
             # TODO: simplify code below
-            tp_rank = get_tensor_model_parallel_rank()
             splitted_input = split_tensor_along_last_dim(
                 input_, num_partitions=self.base_layer.tp_size)
-            input_parallel = splitted_input[tp_rank].contiguous()
+            input_parallel = splitted_input[self.tp_rank].contiguous()
 
         # Matrix multiply.
         output_parallel = self.apply(input_parallel)
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 2ffefe61427e3..49cd9f0c236ad 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -21,7 +21,7 @@
                               LinearScalingRotaryEmbeddingWithLora,
                               LoRAMapping)
 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
-from vllm.lora.punica import PunicaWrapper
+from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.lora.utils import (from_layer, from_layer_logits_processor,
                              is_regex_target_modules,
                              parse_fine_tuned_lora_name, replace_submodule)
@@ -331,9 +331,9 @@ def __init__(
         self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots
         self.vocab_size = vocab_size
         self.long_lora_context: Optional[LongContextLoRAContext] = None
-        self.punica_wrapper = PunicaWrapper(max_num_batched_tokens,
-                                            max_batches=self.max_num_seqs,
-                                            device=self.device)
+        self.punica_wrapper = get_punica_wrapper(max_num_batched_tokens,
+                                                 max_batches=self.max_num_seqs,
+                                                 device=self.device)
         # Scaling factor -> offset to the sin_cos_cache to it.
         # Used for long context lora.
         self.scaling_factor_to_offset: Dict[float, int] = {}
@@ -555,17 +555,17 @@ def create_dummy_lora(
                         input_dim,
                         output_dim,
                         rank,
-                        module.lora_a_stacked.dtype,
+                        module.lora_a_stacked[0].dtype,
                         "cpu",
                         embeddings_tensor_dim=embeddings_tensor_dim,
                         bias_enabled=bias_enabled)
                 else:
                     lora = LoRALayerWeights.create_dummy_lora_weights(
                         module_name,
-                        module.lora_a_stacked.shape[-1],
-                        module.lora_b_stacked.shape[-2],
+                        module.lora_a_stacked[0].shape[-1],
+                        module.lora_b_stacked[0].shape[-2],
                         rank,
-                        module.lora_a_stacked.dtype,
+                        module.lora_a_stacked[0].dtype,
                         "cpu",
                         bias_enabled=bias_enabled,
                     )
diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py
deleted file mode 100644
index 082041f390750..0000000000000
--- a/vllm/lora/punica.py
+++ /dev/null
@@ -1,611 +0,0 @@
-"""
-Based on:
-Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
-Punica: Multi-Tenant LoRA Serving. 
-https://arxiv.org/abs/2310.18547
-"""
-
-from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union
-
-import torch
-
-from vllm.triton_utils import HAS_TRITON
-
-if HAS_TRITON:
-    from vllm.lora.ops.bgmv_expand import bgmv_expand
-    from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
-    from vllm.lora.ops.bgmv_shrink import bgmv_shrink
-    from vllm.lora.ops.sgmv_expand import sgmv_expand
-    from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice
-    from vllm.lora.ops.sgmv_shrink import sgmv_shrink
-
-if TYPE_CHECKING:
-    # avoid circuit import
-    from vllm.lora.layers import LoRAMapping
-    from vllm.lora.models import LongContextLoRAContext
-
-
-def compute_meta(
-    token_lora_tensor: torch.Tensor
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]:
-    """
-    Get the information required for the sgmv kernel. With the  features:
-    1. If consecutive requests in the batch use the same LoRA, this function
-    will combine them into a single request, improving sgmv kernel inference
-    performance.
-    2. At the beginning of each prefill stage inference, recalculations are
-    needed based on the input, but only once.
-    """
-
-    lora_indices_tensor, seq_length_tensor = torch.unique_consecutive(
-        token_lora_tensor, return_counts=True)
-    cum_result = torch.cumsum(seq_length_tensor, dim=0)
-    b_seq_start_tensor = torch.zeros_like(seq_length_tensor)
-    b_seq_start_tensor[1:].copy_(cum_result[:-1])
-    max_length = seq_length_tensor.max().item()
-    token_nums = seq_length_tensor.sum().item()
-    batch_size = lora_indices_tensor.size(0)
-    no_lora = False
-    # -1 means no lora should be applied. Use `no_lora` to determine whether
-    # the current step requires LoRA. If LoRA is not needed, the prefill stage
-    # does not need to launch the triton kernel, which can improve performance
-    if batch_size == 1 and lora_indices_tensor == -1:
-        no_lora = True
-    return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor,
-            batch_size, max_length, token_nums, no_lora)
-
-
-# TODO see if this can be vectorized
-def convert_mapping(
-    mapping: "LoRAMapping",
-    lora_index_to_id: List[Optional[int]],
-    max_loras: int,
-    vocab_size: int,
-    extra_vocab_size: int,
-    device: torch.device,
-    long_lora_context: Optional["LongContextLoRAContext"] = None,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
-           Optional[torch.Tensor], List[int]]:
-    """Converts LoRAMapping to index tensors.
-
-    Args:
-        mapping: LoRAMapping mapping rows in a batch to LoRA ids.
-        lora_index_to_id: List mapping LoRA ids to LoRA indices.
-        max_loras: Maximum number of LoRAs.
-        vocab_size: Model vocab size.
-        extra_vocab_size: Extra vocab size each LoRA can have.
-        long_lora_context: Passed if there are long context lora in a batch.
-
-    Returns:
-        A tuple of tensors:
-            base_indices: Tensor of shape [batch_size] mapping batch rows to
-                LoRA indices.
-            sampler_indices: Tensor of shape [batch_size] mapping requests to
-                LoRA indices for sampler. For generation, this will be the
-                same as base_indicies. For prefill, this will map requests
-                to LoRA indices.
-            sampler_indices_padded: Tensor of shape [batch_size] mapping
-                requests to LoRA indices for sampler with padding.
-                Same as sampler_indicies, but -1 is replaced with
-                max_loras.
-            embeddings_indices: Tensor of shape [2, batch_size] mapping
-                requests to embedding indices. First row is for embeddings
-                added by the LoRAs, second row is for the LoRA.lora_a
-                embeddings.
-            long_lora_indices: Tensor of shape [batch_size] mapping
-                requests to RoPE offsets and rot dims for long LoRAs.
-                None if long context lora doesn't exist.
-            indices_len: List of lengths of the above tensors. It contains
-                (base_indices, sampler_indices, sampler_indices_padded,
-                embeddings_indices, long_lora_indices).
-    """
-    index_mapping_indices: List[int] = list(mapping.index_mapping).copy()
-    embedding_indices = index_mapping_indices.copy()
-    lora_indices = index_mapping_indices.copy()
-    long_lora_offsets: Optional[torch.Tensor] = None
-    if long_lora_context:
-        long_lora_offsets = torch.zeros(len(index_mapping_indices),
-                                        device=device,
-                                        dtype=torch.long)
-    prompt_mapping: List[int] = [
-        lora_index_to_id.index(x) if x > 0 else -1
-        for x in mapping.prompt_mapping
-    ]
-    lora_idx = None
-    for i in range(len(index_mapping_indices)):
-        # TODO index can be slow. optimize
-        lora_idx = (lora_index_to_id.index(index_mapping_indices[i])
-                    if index_mapping_indices[i] > 0 else -1)
-        embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0
-        lora_indices[i] = lora_idx
-        if long_lora_context:
-            assert long_lora_offsets is not None
-            lora_offset: int = long_lora_context.offsets_by_lora_id.get(
-                index_mapping_indices[i], 0)
-            long_lora_offsets[i] = lora_offset
-
-    indices_list: List[Union[List[int], torch.Tensor]] = [
-        index_mapping_indices,
-        lora_indices,
-        embedding_indices,
-    ]
-    if long_lora_context:
-        assert long_lora_offsets is not None
-        indices_list.append(long_lora_offsets)
-    indices = torch.tensor(indices_list, dtype=torch.long, device=device)
-    prompt_mapping_tensor = torch.tensor(prompt_mapping,
-                                         dtype=torch.long,
-                                         device=device)
-    embeddings_indices = torch.stack([
-        indices[2] * extra_vocab_size,
-        indices[2] * (vocab_size + extra_vocab_size),
-    ])
-    embeddings_indices[embeddings_indices == -1] = max_loras - 1
-    base_indices = indices[1]
-    sampler_indices = prompt_mapping_tensor
-    sampler_indices_padded = sampler_indices.clone()
-    sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1
-    sampler_indices_padded = torch.arange(
-        0, len(sampler_indices_padded), device=device, dtype=torch.long) + (
-            sampler_indices_padded * len(sampler_indices_padded))
-    long_lora_indices = None
-    long_lora_indices_len: Optional[int] = None
-    if long_lora_context:
-        long_lora_indices = indices[3]
-        long_lora_indices_len = long_lora_indices.shape[-1]
-    # Contain length of indices tensors. Used to index into each tensor.
-    indices_len = [
-        base_indices.shape[-1],
-        sampler_indices.shape[-1],
-        sampler_indices_padded.shape[-1],
-        embeddings_indices.shape[-1],
-    ]
-    if long_lora_indices_len is not None:
-        indices_len.append(long_lora_indices_len)
-    else:
-        # If long_lora doesn't exist,append None
-        indices_len.append(None)
-
-    return (
-        base_indices,
-        sampler_indices,
-        sampler_indices_padded,
-        embeddings_indices,
-        long_lora_indices,
-        indices_len,
-    )
-
-
-class PunicaWrapper:
-    """
-    PunicaWrapper is designed to manage and provide metadata for the punica 
-    kernel. The main function is to maintain the state information for 
-    Multi-LoRA, and to provide the interface for the punica kernel.
-    """
-
-    def __init__(self, max_num_batched_tokens: int, max_batches: int,
-                 device: Union[torch.device, str]):
-        self._token_lora_indices = torch.empty(max_num_batched_tokens,
-                                               dtype=torch.long,
-                                               device=device)
-        self._sampler_indices = torch.empty(max_num_batched_tokens,
-                                            dtype=torch.long,
-                                            device=device)
-        self._sampler_indices_padded = torch.empty(max_num_batched_tokens,
-                                                   dtype=torch.long,
-                                                   device=device)
-        self._embeddings_indices = torch.empty(2,
-                                               max_num_batched_tokens,
-                                               dtype=torch.long,
-                                               device=device)
-        self._long_lora_indices = torch.empty(max_num_batched_tokens,
-                                              dtype=torch.long,
-                                              device=device)
-
-        # 5 is the number of indicies tensors.
-        # base_indices, sampler_indices, sampler_indices_padded,
-        # embeddings_indices,long_lora_indices
-        self.indices_len: List[Optional[int]] = [None] * 5
-        # these attributes are the information required for sgmv kernel
-        self._seq_start_locs = torch.empty(max_batches,
-                                           dtype=torch.long,
-                                           device=device)
-        self._seq_lengths = torch.empty(max_batches,
-                                        dtype=torch.long,
-                                        device=device)
-        self._lora_indices_per_batch = torch.empty(max_batches,
-                                                   dtype=torch.long,
-                                                   device=device)
-        self.device: torch.device = device
-        self.max_length: int = 0
-        self.token_nums: int = 0
-        self.batch_size: int = -1
-        self.is_prefill = False
-        self.no_lora = False
-
-    def update_metadata(
-        self,
-        mapping: "LoRAMapping",
-        lora_index_to_id: List[Optional[int]],
-        max_loras: int,
-        vocab_size: int,
-        extra_vocab_size: int,
-        long_lora_context: Optional["LongContextLoRAContext"] = None,
-    ):
-
-        self._update_base_metadata(mapping, lora_index_to_id, max_loras,
-                                   vocab_size, extra_vocab_size,
-                                   long_lora_context)
-        if mapping.is_prefill:
-            # Update metadata required for prefill-related operators.
-            self._update_prefill_metada(self.token_lora_indices)
-            self.is_prefill = True
-        else:
-            self.is_prefill = False
-
-    def _update_base_metadata(
-        self,
-        mapping: "LoRAMapping",
-        lora_index_to_id: List[Optional[int]],
-        max_loras: int,
-        vocab_size: int,
-        extra_vocab_size: int,
-        long_lora_context: Optional["LongContextLoRAContext"] = None,
-    ):
-        (
-            base_indices,
-            sampler_indices,
-            sampler_indices_padded,
-            embeddings_indices,
-            long_lora_offsets_tensor,
-            indices_len,
-        ) = convert_mapping(
-            mapping,
-            lora_index_to_id,
-            max_loras,
-            vocab_size,
-            extra_vocab_size,
-            self.device,
-            long_lora_context,
-        )
-        self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
-        self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
-        self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
-            sampler_indices_padded)
-        self._embeddings_indices[:embeddings_indices.
-                                 shape[0], :embeddings_indices.shape[1]].copy_(
-                                     embeddings_indices)
-        if long_lora_offsets_tensor is not None:
-            self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_(
-                long_lora_offsets_tensor)
-        else:
-            self._long_lora_indices.zero_()
-        self.indices_len[:] = indices_len
-
-    def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None:
-
-        (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor,
-         batch_size, max_length, token_nums,
-         no_lora) = compute_meta(token_lora_tensor)
-
-        self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_(
-            b_seq_start_tensor)
-        self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor)
-        self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_(
-            lora_indices_tensor)
-        self.batch_size = batch_size
-        self.max_length = max_length
-        self.token_nums = token_nums
-        self.no_lora = no_lora
-
-    @property
-    def prefill_metadata(
-        self
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]:
-        """
-        This property provides a convenient way to access the necessary 
-        metadata for prefill-related  kernel computations.
-            1. seq_start_locs: Tensor of sequence start positions.
-            2. seq_lengths: Tensor of sequence lengths.
-            3. lora_indices_per_batch: Tensor of lora indices, and an index of 
-                -1 means no lora should be applied.
-            4. batch_size: Batch size after clustering identical lora indices.
-            5. max_length: The maximum sequence length in the batch.
-            6. token_nums: The token numbers in the batch.
-        """
-        return (self._seq_start_locs[:self.batch_size],
-                self._seq_lengths[:self.batch_size],
-                self._lora_indices_per_batch[:self.batch_size],
-                self.batch_size, self.max_length, self.token_nums)
-
-    @property
-    def token_lora_indices(self) -> torch.Tensor:
-        """
-        This property provides the lora indices corresponding to each token 
-        in the batch. An index of -1 means no lora should be applied.
-        """
-        token_lora_len = self.indices_len[0]
-        return self._token_lora_indices[:token_lora_len]
-
-    @property
-    def sampler_indices(self) -> torch.Tensor:
-        """ 
-        This property is used to access the lora indices specifically for 
-        LogitsProcessorWithLoRA.
-        """
-        sampler_indices_len = self.indices_len[1]
-        return self._sampler_indices[:sampler_indices_len]
-
-    @property
-    def sampler_indices_padded(self) -> torch.Tensor:
-        """
-        This property provides access to padded sampler indices.
-        """
-        indices_padded_len = self.indices_len[2]
-        return self._sampler_indices_padded[:indices_padded_len]
-
-    @property
-    def embeddings_indices(self) -> torch.Tensor:
-        """
-        This property provides access to the indices used for lora embeddings, 
-        specifically for VocabParallelEmbeddingWithLoRA.
-        """
-        embeddings_indices_len = self.indices_len[3]
-        return self._embeddings_indices[:, :embeddings_indices_len]
-
-    @property
-    def long_lora_indices(self) -> torch.Tensor:
-        """ 
-        This property provides access to the indices used for long context 
-        lora, specifically for LinearScalingRotaryEmbeddingWithLora.
-        """
-        long_lora_len = self.indices_len[4]
-        return self._long_lora_indices[:long_lora_len]
-
-    def shrink_prefill(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        scale: float,
-    ):
-        #No LoRA request, so return directly
-        if self.no_lora:
-            return
-        sgmv_shrink(
-            x,
-            w_t_all,
-            y,
-            *self.prefill_metadata,
-            scale,
-        )
-
-    def shrink_decode(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        scale: float,
-    ):
-        bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale)
-
-    def expand_prefill(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        add_input: bool,
-    ):
-        #No LoRA request, so return directly
-        if self.no_lora:
-            return
-        sgmv_expand(
-            x,
-            w_t_all,
-            y,
-            *self.prefill_metadata,
-            add_input,
-        )
-
-    def expand_decode(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        add_input: bool,
-    ):
-        bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input)
-
-    def expand_slice_prefill(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        y_offset: Optional[int],
-        y_slice_size: Optional[int],
-        add_input: bool,
-    ):
-        #No LoRA request, so return directly
-        if self.no_lora:
-            return
-        sgmv_expand_slice(
-            x,
-            w_t_all,
-            y,
-            *self.prefill_metadata,
-            y_offset,
-            y_slice_size,
-            add_input,
-        )
-
-    def expand_slice_decode(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        y_offset: Optional[int],
-        y_slice_size: Optional[int],
-        add_input: bool,
-    ):
-        bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset,
-                          y_slice_size, add_input)
-
-    def add_shrink(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        scale: float,
-    ):
-        """
-        Perform the ` y+=x@w_t_all` computation, which is suitable for the
-        GEMM of lora'a.
-        When `is_prefill is` true, it indicates that it is currently the
-        prefill stage, and the `shrink_prefill` function should be called.
-        Otherwise, it is the decode stage, and the shrink_decode function
-        should be called.
-        """
-        shrink_fun: Callable = (self.shrink_prefill
-                                if self.is_prefill else self.shrink_decode)
-        shrink_fun(y, x, w_t_all, scale)
-
-    def add_expand(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        add_input: bool = True,
-    ):
-        """
-        Perform the ` y+=x@w_t_all` computation, which is suitable for the
-        GEMM of lora'b.
-        When `is_prefill` is true, it indicates that it is currently the
-        prefill stage, and the `expand_prefill` function should be called.
-        Otherwise, it is the decode stage, and the expand_decode function
-        should be called.
-        """
-
-        expand_fun: Callable = (self.expand_prefill
-                                if self.is_prefill else self.expand_decode)
-        expand_fun(y, x, w_t_all, add_input)
-
-    def add_expand_slice(self,
-                         y: torch.Tensor,
-                         x: torch.Tensor,
-                         w_t_all: torch.Tensor,
-                         y_offset: Optional[int],
-                         y_slice_size: Optional[int],
-                         add_input: bool = True):
-        """
-        Similar to `add_expand`
-        """
-
-        expand_slice_fun: Callable = (self.expand_slice_prefill
-                                      if self.is_prefill else
-                                      self.expand_slice_decode)
-        expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input)
-
-    def add_lora(self,
-                 y: torch.Tensor,
-                 x: torch.Tensor,
-                 wa_t_all: torch.Tensor,
-                 wb_t_all: torch.Tensor,
-                 scale: float,
-                 y_offset: Optional[int] = None,
-                 y_slice_size: Optional[int] = None,
-                 *,
-                 buffer: Optional[torch.Tensor] = None) -> None:
-        """
-        Semantics:
-        y[i] += (
-            x[i].unsqueeze(0)
-            @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-            @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-            * scale
-            ).squeeze(0)
-        Args:
-            y (torch.Tensor):  Output tensor. Will be changed in-place.
-            x (torch.Tensor): Input tensor
-            wa_t_all (torch.Tensor): lora_a's weight
-            wb_t_all (torch.Tensor): lora_b's weight
-            scale (float): Scaling factor.
-            y_offset (Optional[int], optional): Offset to apply to the starting
-                column of y.
-            y_slice_size (Optional[int], optional): Size of the y column slice.
-            buffer (Optional[torch.Tensor], optional): Defaults to None.
-        """
-        y_org = y
-        y = y.view(-1, y.shape[-1])
-        x = x.view(-1, x.shape[-1])
-        r = wb_t_all.size(-1)
-        if buffer is None:
-            # We set the buffer to be float32 by default ,refer to:
-            # https://github.com/triton-lang/triton/issues/1387
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
-
-        self.add_shrink(buffer, x, wa_t_all, scale)
-        if y_offset is None and y_slice_size is None:
-            self.add_expand(y, buffer, wb_t_all, add_input=True)
-        else:
-            self.add_expand_slice(y,
-                                  buffer,
-                                  wb_t_all,
-                                  y_offset,
-                                  y_slice_size,
-                                  add_input=True)
-        y = y.view_as(y_org)
-
-    def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor,
-                               lora_a_stacked: Tuple[torch.Tensor,
-                                                     torch.Tensor,
-                                                     torch.Tensor],
-                               lora_b_stacked: Tuple[torch.Tensor,
-                                                     torch.Tensor,
-                                                     torch.Tensor],
-                               scale: float,
-                               output_slices: Tuple[int, ...]) -> None:
-        """
-        Applies lora to each input. Similar to add_lora, This method is 
-        used for layers that are composed of multiple sublayers
-        (slices) packed together.
-        """
-        y_org = y
-        x = x.view(-1, x.shape[-1])
-        y = y.view(-1, y.shape[-1])
-        offset_left = 0
-        # TODO fuse these kernels
-        for slice_idx in range(len(output_slices)):
-            self.add_lora(y, x, lora_a_stacked[slice_idx],
-                          lora_b_stacked[slice_idx], scale, offset_left,
-                          output_slices[slice_idx])
-            offset_left += output_slices[slice_idx]
-
-        y = y.view_as(y_org)
-
-    def add_lora_logits(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        wa_t_all: torch.Tensor,
-                        wb_t_all: torch.Tensor,
-                        scale,
-                        *,
-                        buffer: Optional[torch.Tensor] = None) -> None:
-        """
-        LogitsProcessorWithLoRA always using bgmv
-        """
-        y_org = y
-        y = y.view(-1, y.shape[-1])
-        x = x.view(-1, x.shape[-1])
-        r = wb_t_all.size(-1)
-        if buffer is None:
-            # We set the buffer to be float32 by default ,refer to:
-            # https://github.com/triton-lang/triton/issues/1387
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
-
-        bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale)
-        bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True)
-        y = y.view_as(y_org)
diff --git a/vllm/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py
new file mode 100644
index 0000000000000..48ada3926ea46
--- /dev/null
+++ b/vllm/lora/punica_wrapper/__init__.py
@@ -0,0 +1,7 @@
+from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
+from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper
+
+__all__ = [
+    "PunicaWrapperBase",
+    "get_punica_wrapper",
+]
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
new file mode 100644
index 0000000000000..0a5a84bdd8deb
--- /dev/null
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -0,0 +1,480 @@
+"""
+Based on:
+Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
+Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547
+"""
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+import torch
+
+from .utils import compute_meta, convert_mapping
+
+if TYPE_CHECKING:
+    # avoid circuit import
+    from vllm.lora.layers import LoRAMapping
+    from vllm.lora.models import LongContextLoRAContext
+
+
+class PunicaWrapperABC(ABC):
+    """
+    PunicaWrapper ABC.
+    """
+
+    @abstractmethod
+    def update_metadata(
+        self,
+        mapping: "LoRAMapping",
+        lora_index_to_id: List[Optional[int]],
+        max_loras: int,
+        vocab_size: int,
+        extra_vocab_size: int,
+        long_lora_context: Optional["LongContextLoRAContext"] = None,
+        **kwargs,
+    ) -> None:
+        """
+        Update the lora-related metadata
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_shrink(
+        self,
+        y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+        x: torch.Tensor,
+        lora_a_stacked: Tuple[torch.Tensor, ...],
+        scale: float,
+        **kwargs,
+    ) -> None:
+        """
+        Performs GEMM  for multiple slices of lora_a.
+        """
+
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_expand(
+        self,
+        y: torch.Tensor,
+        x: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+        lora_b_stacked: Tuple[torch.Tensor, ...],
+        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+        output_slices: Tuple[int, ...],
+        offset_start: int = 0,
+        add_input=True,
+        **kwargs,
+    ) -> None:
+        """
+        Performs GEMM and bias addition for multiple slices of lora_b.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_lora_embedding(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        add_input: bool = True,
+        **kwargs,
+    ) -> None:
+        """
+        Applies lora  specifically for VocabParallelEmbeddingWithLoRA, 
+        and this layer only requires the expand operation.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_lora_linear(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: Tuple[torch.Tensor, ...],
+                        lora_b_stacked: Tuple[torch.Tensor, ...],
+                        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                        scale: float,
+                        output_slices: Tuple[int, ...],
+                        *,
+                        buffer: Optional[Tuple[torch.Tensor, ...]] = None,
+                        **kwargs) -> None:
+        """
+        Applicable to linear-related lora. 
+        """
+
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_lora_logits(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: torch.Tensor,
+                        lora_b_stacked: torch.Tensor,
+                        scale,
+                        *,
+                        buffer: Optional[torch.Tensor] = None,
+                        **kwargs) -> None:
+        """
+        Applies lora  specifically for LogitsProcessorWithLoRA.
+        """
+        raise NotImplementedError
+
+
+class PunicaWrapperBase(PunicaWrapperABC):
+    """
+    PunicaWrapperBase is designed to manage and provide metadata for the punica 
+    kernel. The main function is to maintain the state information for 
+    Multi-LoRA, and to provide the interface for the punica.
+    """
+
+    def __init__(self, max_num_batched_tokens: int, max_batches: int,
+                 device: Union[torch.device, str], **kwargs):
+        self._token_lora_indices = torch.empty(max_num_batched_tokens,
+                                               dtype=torch.long,
+                                               device=device)
+        self._sampler_indices = torch.empty(max_num_batched_tokens,
+                                            dtype=torch.long,
+                                            device=device)
+        self._sampler_indices_padded = torch.empty(max_num_batched_tokens,
+                                                   dtype=torch.long,
+                                                   device=device)
+        self._embeddings_indices = torch.empty(2,
+                                               max_num_batched_tokens,
+                                               dtype=torch.long,
+                                               device=device)
+        self._long_lora_indices = torch.empty(max_num_batched_tokens,
+                                              dtype=torch.long,
+                                              device=device)
+
+        # 5 is the number of indicies tensors.
+        # base_indices, sampler_indices, sampler_indices_padded,
+        # embeddings_indices,long_lora_indices
+        self.indices_len: List[Optional[int]] = [None] * 5
+        # these attributes are the information required for sgmv kernel
+        self._seq_start_locs = torch.empty(max_batches,
+                                           dtype=torch.long,
+                                           device=device)
+        self._seq_lengths = torch.empty(max_batches,
+                                        dtype=torch.long,
+                                        device=device)
+        self._lora_indices_per_batch = torch.empty(max_batches,
+                                                   dtype=torch.long,
+                                                   device=device)
+        self.device: torch.device = device
+        self.max_length: int = 0
+        self.token_nums: int = 0
+        self.batch_size: int = -1
+        self.is_prefill = False
+        self.no_lora = False
+
+    def _update_base_metadata(
+        self,
+        mapping: "LoRAMapping",
+        lora_index_to_id: List[Optional[int]],
+        max_loras: int,
+        vocab_size: int,
+        extra_vocab_size: int,
+        long_lora_context: Optional["LongContextLoRAContext"] = None,
+    ):
+        (
+            base_indices,
+            sampler_indices,
+            sampler_indices_padded,
+            embeddings_indices,
+            long_lora_offsets_tensor,
+            indices_len,
+        ) = convert_mapping(
+            mapping,
+            lora_index_to_id,
+            max_loras,
+            vocab_size,
+            extra_vocab_size,
+            self.device,
+            long_lora_context,
+        )
+        self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
+        self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
+        self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
+            sampler_indices_padded)
+        self._embeddings_indices[:embeddings_indices.
+                                 shape[0], :embeddings_indices.shape[1]].copy_(
+                                     embeddings_indices)
+        if long_lora_offsets_tensor is not None:
+            self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_(
+                long_lora_offsets_tensor)
+        else:
+            self._long_lora_indices.zero_()
+        self.indices_len[:] = indices_len
+
+    def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None:
+
+        (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor,
+         batch_size, max_length, token_nums,
+         no_lora) = compute_meta(token_lora_tensor)
+
+        self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_(
+            b_seq_start_tensor)
+        self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor)
+        self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_(
+            lora_indices_tensor)
+        self.batch_size = batch_size
+        self.max_length = max_length
+        self.token_nums = token_nums
+        self.no_lora = no_lora
+
+    def _apply_bias(
+        self,
+        indices: torch.Tensor,
+        output: torch.Tensor,
+        output_slices: Tuple[int, ...],
+        lora_bias_stacked: Tuple[Optional[torch.Tensor], ...],
+    ):
+        """Applies bias to output
+
+        Input shapes:
+            lora_bias_stacked:      3 element tuple of (num_loras, output_dim)
+            indices:           (batch_size)
+            output:            (batch_size, q_slice_size + 2*kv_slice_size)
+            output_slices:     n-1 element tuple of (slice_size...),
+                            where n is number of slices
+        """
+        org_output = output
+        output = output.view(-1, output.shape[-1])
+        indices = indices.view(-1)
+
+        offset_left = 0
+        for slice_idx, slice in enumerate(output_slices):
+            bias = lora_bias_stacked[slice_idx]
+            if bias is not None:
+                bias = bias.view(-1, bias.shape[-1])
+                bias = bias[indices]
+                bias[indices == -1] = 0
+                output[:, offset_left:offset_left + slice] += bias
+            offset_left += slice
+
+        return output.view_as(org_output)
+
+    @property
+    def prefill_metadata(
+        self
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]:
+        """
+        This property provides a convenient way to access the necessary 
+        metadata for prefill-related  kernel computations.
+            1. seq_start_locs: Tensor of sequence start positions.
+            2. seq_lengths: Tensor of sequence lengths.
+            3. lora_indices_per_batch: Tensor of lora indices, and an index of 
+                -1 means no lora should be applied.
+            4. batch_size: Batch size after clustering identical lora indices.
+            5. max_length: The maximum sequence length in the batch.
+            6. token_nums: The token numbers in the batch.
+        """
+        return (self._seq_start_locs[:self.batch_size],
+                self._seq_lengths[:self.batch_size],
+                self._lora_indices_per_batch[:self.batch_size],
+                self.batch_size, self.max_length, self.token_nums)
+
+    @property
+    def token_lora_indices(self) -> torch.Tensor:
+        """
+        This property provides the lora indices corresponding to each token 
+        in the batch. An index of -1 means no lora should be applied.
+        """
+        token_lora_len = self.indices_len[0]
+        return self._token_lora_indices[:token_lora_len]
+
+    @property
+    def sampler_indices(self) -> torch.Tensor:
+        """ 
+        This property is used to access the lora indices specifically for 
+        LogitsProcessorWithLoRA.
+        """
+        sampler_indices_len = self.indices_len[1]
+        return self._sampler_indices[:sampler_indices_len]
+
+    @property
+    def sampler_indices_padded(self) -> torch.Tensor:
+        """
+        This property provides access to padded sampler indices.
+        """
+        indices_padded_len = self.indices_len[2]
+        return self._sampler_indices_padded[:indices_padded_len]
+
+    @property
+    def embeddings_indices(self) -> torch.Tensor:
+        """
+        This property provides access to the indices used for lora embeddings, 
+        specifically for VocabParallelEmbeddingWithLoRA.
+        """
+        embeddings_indices_len = self.indices_len[3]
+        return self._embeddings_indices[:, :embeddings_indices_len]
+
+    @property
+    def long_lora_indices(self) -> torch.Tensor:
+        """ 
+        This property provides access to the indices used for long context 
+        lora, specifically for LinearScalingRotaryEmbeddingWithLora.
+        """
+        long_lora_len = self.indices_len[4]
+        return self._long_lora_indices[:long_lora_len]
+
+    def update_metadata(
+            self,
+            mapping: "LoRAMapping",
+            lora_index_to_id: List[Optional[int]],
+            max_loras: int,
+            vocab_size: int,
+            extra_vocab_size: int,
+            long_lora_context: Optional["LongContextLoRAContext"] = None,
+            **kwargs):
+
+        self._update_base_metadata(mapping, lora_index_to_id, max_loras,
+                                   vocab_size, extra_vocab_size,
+                                   long_lora_context)
+        if mapping.is_prefill:
+            # Update metadata required for prefill-related operators.
+            self._update_prefill_metada(self.token_lora_indices)
+            self.is_prefill = True
+        else:
+            self.is_prefill = False
+
+    @abstractmethod
+    def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...],
+                   scale: float, **kwargs) -> None:
+        """
+        Performs GEMM  for multiple slices of lora_a.
+
+        Semantics:
+        for i in range(len(lora_a_stacked)):
+            y[i] += (x @ lora_a_stacked[i]) * scale
+        
+        Args:
+            y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights
+            scale (float): Scaling factor for the operation
+
+        """
+        # TODO: implement it based on torch ops
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_expand(self,
+                   y: torch.Tensor,
+                   x: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   lora_b_stacked: Tuple[torch.Tensor, ...],
+                   lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                   output_slices: Tuple[int, ...],
+                   offset_start: int = 0,
+                   add_input=True,
+                   **kwargs) -> None:
+        """
+        Performs GEMM and bias addition for multiple slices of lora_b.
+      
+        Semantics:
+            for i in range(len(lora_b_stacked)):
+                slice = output_slices[i]
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + 
+                    lora_bias_stacked[i] 
+                offset += slice
+            
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): 
+                bias's weight
+            output_slices (Tuple[int, ...]): Every slice's size
+            add_input (bool):  Defaults to True.
+
+        """
+        # TODO: implement it based on torch ops
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_lora_embedding(self,
+                           y: torch.Tensor,
+                           x: torch.Tensor,
+                           lora_b_stacked: torch.Tensor,
+                           add_input: bool = True,
+                           **kwargs) -> None:
+        """
+        Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
+        and this layer only requires the expand operation.
+        Semantics:
+            y += x @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_b_stacked (torch.Tensor): lora_b's weights.
+            add_input (bool): Default to True.
+        """
+        # TODO: implement it based on torch ops
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_lora_linear(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: Tuple[torch.Tensor, ...],
+                        lora_b_stacked: Tuple[torch.Tensor, ...],
+                        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                        scale: float,
+                        output_slices: Tuple[int, ...],
+                        *,
+                        buffer: Optional[Tuple[torch.Tensor, ...]] = None,
+                        **kwargs) -> None:
+        """
+        Applicable to linear-related lora. 
+
+        Semantics:
+            for i in range(len(lora_a_stacked)):
+                y[i] += (
+                    x[i].unsqueeze(0)
+                    @ lora_a_stacked[indices[i], layer_idx, :, :]
+                    @ lora_b_stacked[indices[i], layer_idx, :, :]
+                    * scale
+                    ).squeeze(0)+lora_bias_stacked[i]
+
+        Args:
+            y (torch.Tensor): Output tensor. Will be changed in-place.
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight.
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight.
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias.
+            scale (float): Scaling factor.
+            output_slices (Tuple[int, ...]): Every slice's size.
+            buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None.
+        """
+        # TODO: implement it based on torch ops
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_lora_logits(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: torch.Tensor,
+                        lora_b_stacked: torch.Tensor,
+                        scale,
+                        *,
+                        buffer: Optional[torch.Tensor] = None,
+                        **kwargs) -> None:
+        """
+        Applies lora  specifically for LogitsProcessorWithLoRA.
+        
+        Semantics:
+            buffer = (x @ lora_a_stacked) * scale
+            y += buffer @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_a_stacked (torch.Tensor): lora_a's weights.
+            lora_b_stacked (torch.Tensor):lora_b's weights.
+            scale (float): Scaling factor.
+            buffer (Optional[torch.Tensor]):Default to None.
+        """
+        # TODO: implement it based on torch ops
+        raise NotImplementedError
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
new file mode 100644
index 0000000000000..b2af29de129ce
--- /dev/null
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -0,0 +1,358 @@
+"""
+Based on:
+Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
+Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547
+"""
+
+from typing import Callable, Optional, Tuple, Union, final
+
+import torch
+
+from vllm.triton_utils import HAS_TRITON
+
+if HAS_TRITON:
+    from vllm.lora.ops.bgmv_expand import bgmv_expand
+    from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
+    from vllm.lora.ops.bgmv_shrink import bgmv_shrink
+    from vllm.lora.ops.sgmv_expand import sgmv_expand
+    from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice
+    from vllm.lora.ops.sgmv_shrink import sgmv_shrink
+
+from .punica_base import PunicaWrapperBase
+
+
+@final
+class PunicaWrapperGPU(PunicaWrapperBase):
+    """
+    PunicaWrapperGPU is designed to manage and provide metadata for the punica 
+    kernel. The main function is to maintain the state information for 
+    Multi-LoRA, and to provide the interface for the punica triton kernel.
+    """
+
+    def __init__(self, max_num_batched_tokens: int, max_batches: int,
+                 device: Union[torch.device, str], **kwargs):
+        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
+                                   device)
+
+    def _shrink_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        scale: float,
+    ):
+        #No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_shrink(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            scale,
+        )
+
+    def _shrink_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        scale: float,
+    ):
+        bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale)
+
+    def _expand_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        add_input: bool,
+    ):
+        #No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_expand(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            add_input,
+        )
+
+    def _expand_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        add_input: bool,
+    ):
+        bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input)
+
+    def _expand_slice_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: Optional[int],
+        y_slice_size: Optional[int],
+        add_input: bool,
+    ):
+        #No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_expand_slice(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            y_offset,
+            y_slice_size,
+            add_input,
+        )
+
+    def _expand_slice_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: Optional[int],
+        y_slice_size: Optional[int],
+        add_input: bool,
+    ):
+        bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset,
+                          y_slice_size, add_input)
+
+    def _apply_expand(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: Optional[int],
+        y_slice_size: Optional[int],
+        add_input: bool = True,
+    ):
+        """
+        Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` 
+        computation, which is suitable for the
+        GEMM of lora'b.
+        """
+
+        expand_slice_fun: Callable = (self._expand_slice_prefill
+                                      if self.is_prefill else
+                                      self._expand_slice_decode)
+        expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input)
+
+    def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor,
+                      w_t_all: torch.Tensor, scale: float):
+        """
+        Perform the ` y+=x@w_t_all` computation, which is suitable for the
+        GEMM of lora'a.
+        When `is_prefill is` true, it indicates that it is currently the
+        prefill stage, and the `_shrink_prefill` function should be called.
+        Otherwise, it is the decode stage, and the _shrink_decode function
+        should be called.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        shrink_fun: Callable = (self._shrink_prefill
+                                if self.is_prefill else self._shrink_decode)
+        shrink_fun(y, x, w_t_all, scale)
+        y = y.view_as(y_org)
+
+    def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...],
+                   scale: float, **kwargs):
+        """
+        Performs GEMM  for multiple slices of lora_a.
+        When `is_prefill is` true, it indicates that it is currently the
+        prefill stage, and the `_shrink_prefill` function should be called.
+        Otherwise, it is the decode stage, and the _shrink_decode function
+        should be called.
+            
+        Semantics:
+        for i in range(len(lora_a_stacked)):
+            y[i] += (x @ lora_a_stacked[i]) * scale
+        
+        Args:
+            y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights
+            scale (float): Scaling factor for the operation
+        """
+
+        x = x.view(-1, x.shape[-1])
+        # TODO fuse these kernels
+        for slice_idx in range(len(lora_a_stacked)):
+            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx],
+                               scale)
+
+    def add_expand(self,
+                   y: torch.Tensor,
+                   x: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   lora_b_stacked: Tuple[torch.Tensor, ...],
+                   lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                   output_slices: Tuple[int, ...],
+                   offset_start: int = 0,
+                   add_input=True,
+                   **kwargs) -> None:
+        """
+        Performs GEMM and bias addition for multiple slices of lora_b.
+      
+        Semantics:
+            for i in range(len(lora_b_stacked)):
+                slice = output_slices[i]
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + 
+                    lora_bias_stacked[i] 
+                offset += slice
+            
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): 
+                bias's weight
+            output_slices (Tuple[int, ...]): Every slice's size
+            add_input (bool):  Defaults to True.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        offset_left = offset_start
+        if lora_bias_stacked is not None:
+            self._apply_bias(self.token_lora_indices, y, output_slices,
+                             lora_bias_stacked)
+        for slice_idx in range(len(lora_b_stacked)):
+            self._apply_expand(
+                y,
+                x[slice_idx],
+                lora_b_stacked[slice_idx],
+                offset_left,
+                output_slices[slice_idx],
+                add_input=add_input,
+            )
+            offset_left += output_slices[slice_idx]
+        y = y.view_as(y_org)
+
+    def add_lora_embedding(self,
+                           y: torch.Tensor,
+                           x: torch.Tensor,
+                           lora_b_stacked: torch.Tensor,
+                           add_input: bool = True,
+                           **kwargs) -> None:
+        """
+        Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
+
+        Semantics:
+            y += x @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_b_stacked (torch.Tensor): lora_b's weights.
+            add_input (bool): Default to True.
+        """
+
+        # Embedding layer only need expand op
+        expand_fun: Callable = (self._expand_prefill
+                                if self.is_prefill else self._expand_decode)
+        expand_fun(y, x, lora_b_stacked, add_input)
+
+    def add_lora_linear(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: Tuple[torch.Tensor, ...],
+                        lora_b_stacked: Tuple[torch.Tensor, ...],
+                        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                        scale: float,
+                        output_slices: Tuple[int, ...],
+                        *,
+                        buffer: Optional[Tuple[torch.Tensor, ...]] = None,
+                        **kwargs) -> None:
+        """
+        Applicable to linear-related lora. 
+
+        Semantics:
+            for i in range(len(lora_a_stacked)):
+                y[i] += (
+                    x[i].unsqueeze(0)
+                    @ lora_a_stacked[indices[i], layer_idx, :, :]
+                    @ lora_b_stacked[indices[i], layer_idx, :, :]
+                    * scale
+                    ).squeeze(0)+lora_bias_stacked[i]
+
+        Args:
+            y (torch.Tensor): Output tensor. Will be changed in-place.
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight.
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight.
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias.
+            scale (float): Scaling factor.
+            output_slices (Tuple[int, ...]): Every slice's size.
+            buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None.
+        """
+
+        assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
+        if lora_bias_stacked is not None:
+            assert len(lora_bias_stacked) == len(output_slices)
+            y = self._apply_bias(self.token_lora_indices, y, output_slices,
+                                 lora_bias_stacked)
+
+        if buffer is None:
+            r = lora_b_stacked[0].size(-1)
+            # We set the buffer to be float32 by default ,refer to:
+            # https://github.com/triton-lang/triton/issues/1387
+            buffer = tuple(
+                torch.zeros(
+                    (x.size(0), r), dtype=torch.float32, device=x.device)
+                for _ in range(len(output_slices)))
+        self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
+        self.add_expand(y,
+                        buffer,
+                        lora_b_stacked,
+                        None,
+                        output_slices,
+                        add_input=True,
+                        **kwargs)
+
+    def add_lora_logits(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: torch.Tensor,
+                        lora_b_stacked: torch.Tensor,
+                        scale,
+                        *,
+                        buffer: Optional[torch.Tensor] = None,
+                        **kwargs) -> None:
+        """
+        Applies lora  specifically for LogitsProcessorWithLoRA.
+        
+        Semantics:
+            buffer = (x @ lora_a_stacked) * scale
+            y += buffer @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_a_stacked (torch.Tensor): lora_a's weights.
+            lora_b_stacked (torch.Tensor):lora_b's weights.
+            scale (float): Scaling factor.
+            buffer (Optional[torch.Tensor]):Default to None.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        x = x.view(-1, x.shape[-1])
+        r = lora_b_stacked.size(-1)
+        if buffer is None:
+            # We set the buffer to be float32 by default ,refer to:
+            # https://github.com/triton-lang/triton/issues/1387
+            buffer = torch.zeros((x.size(0), r),
+                                 dtype=torch.float32,
+                                 device=x.device)
+        # LogitsProcessorWithLoRA always using bgmv.
+        bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale)
+        bgmv_expand(buffer,
+                    lora_b_stacked,
+                    y,
+                    self.sampler_indices,
+                    add_inputs=True)
+        y = y.view_as(y_org)
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
new file mode 100644
index 0000000000000..df6c1bdc7dd71
--- /dev/null
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -0,0 +1,14 @@
+from vllm.platforms import current_platform
+from vllm.utils import print_info_once
+
+from .punica_base import PunicaWrapperBase
+
+
+def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
+    if current_platform.is_cuda_alike():
+        # Lazy import to avoid ImportError
+        from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
+        print_info_once("Using PunicaWrapperGPU.")
+        return PunicaWrapperGPU(*args, **kwargs)
+    else:
+        raise NotImplementedError
diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py
new file mode 100644
index 0000000000000..7360c8c09e3ac
--- /dev/null
+++ b/vllm/lora/punica_wrapper/utils.py
@@ -0,0 +1,159 @@
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+import torch
+
+if TYPE_CHECKING:
+    # avoid circuit import
+    from vllm.lora.layers import LoRAMapping
+    from vllm.lora.models import LongContextLoRAContext
+
+
+def compute_meta(
+    token_lora_tensor: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]:
+    """
+    Get the information required for the sgmv kernel. With the  features:
+    1. If consecutive requests in the batch use the same LoRA, this function
+    will combine them into a single request, improving sgmv kernel inference
+    performance.
+    2. At the beginning of each prefill stage inference, recalculations are
+    needed based on the input, but only once.
+    """
+
+    lora_indices_tensor, seq_length_tensor = torch.unique_consecutive(
+        token_lora_tensor, return_counts=True)
+    cum_result = torch.cumsum(seq_length_tensor, dim=0)
+    b_seq_start_tensor = torch.zeros_like(seq_length_tensor)
+    b_seq_start_tensor[1:].copy_(cum_result[:-1])
+    max_length = seq_length_tensor.max().item()
+    token_nums = seq_length_tensor.sum().item()
+    batch_size = lora_indices_tensor.size(0)
+    no_lora = False
+    # -1 means no lora should be applied. Use `no_lora` to determine whether
+    # the current step requires LoRA. If LoRA is not needed, the prefill stage
+    # does not need to launch the triton kernel, which can improve performance
+    if batch_size == 1 and lora_indices_tensor == -1:
+        no_lora = True
+    return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor,
+            batch_size, max_length, token_nums, no_lora)
+
+
+# TODO see if this can be vectorized
+def convert_mapping(
+    mapping: "LoRAMapping",
+    lora_index_to_id: List[Optional[int]],
+    max_loras: int,
+    vocab_size: int,
+    extra_vocab_size: int,
+    device: torch.device,
+    long_lora_context: Optional["LongContextLoRAContext"] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
+           Optional[torch.Tensor], List[int]]:
+    """Converts LoRAMapping to index tensors.
+
+    Args:
+        mapping: LoRAMapping mapping rows in a batch to LoRA ids.
+        lora_index_to_id: List mapping LoRA ids to LoRA indices.
+        max_loras: Maximum number of LoRAs.
+        vocab_size: Model vocab size.
+        extra_vocab_size: Extra vocab size each LoRA can have.
+        long_lora_context: Passed if there are long context lora in a batch.
+
+    Returns:
+        A tuple of tensors:
+            base_indices: Tensor of shape [batch_size] mapping batch rows to
+                LoRA indices.
+            sampler_indices: Tensor of shape [batch_size] mapping requests to
+                LoRA indices for sampler. For generation, this will be the
+                same as base_indicies. For prefill, this will map requests
+                to LoRA indices.
+            sampler_indices_padded: Tensor of shape [batch_size] mapping
+                requests to LoRA indices for sampler with padding.
+                Same as sampler_indicies, but -1 is replaced with
+                max_loras.
+            embeddings_indices: Tensor of shape [2, batch_size] mapping
+                requests to embedding indices. First row is for embeddings
+                added by the LoRAs, second row is for the LoRA.lora_a
+                embeddings.
+            long_lora_indices: Tensor of shape [batch_size] mapping
+                requests to RoPE offsets and rot dims for long LoRAs.
+                None if long context lora doesn't exist.
+            indices_len: List of lengths of the above tensors. It contains
+                (base_indices, sampler_indices, sampler_indices_padded,
+                embeddings_indices, long_lora_indices).
+    """
+    index_mapping_indices: List[int] = list(mapping.index_mapping).copy()
+    embedding_indices = index_mapping_indices.copy()
+    lora_indices = index_mapping_indices.copy()
+    long_lora_offsets: Optional[torch.Tensor] = None
+    if long_lora_context:
+        long_lora_offsets = torch.zeros(len(index_mapping_indices),
+                                        device=device,
+                                        dtype=torch.long)
+    prompt_mapping: List[int] = [
+        lora_index_to_id.index(x) if x > 0 else -1
+        for x in mapping.prompt_mapping
+    ]
+    lora_idx = None
+    for i in range(len(index_mapping_indices)):
+        # TODO index can be slow. optimize
+        lora_idx = (lora_index_to_id.index(index_mapping_indices[i])
+                    if index_mapping_indices[i] > 0 else -1)
+        embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0
+        lora_indices[i] = lora_idx
+        if long_lora_context:
+            assert long_lora_offsets is not None
+            lora_offset: int = long_lora_context.offsets_by_lora_id.get(
+                index_mapping_indices[i], 0)
+            long_lora_offsets[i] = lora_offset
+
+    indices_list: List[Union[List[int], torch.Tensor]] = [
+        index_mapping_indices,
+        lora_indices,
+        embedding_indices,
+    ]
+    if long_lora_context:
+        assert long_lora_offsets is not None
+        indices_list.append(long_lora_offsets)
+    indices = torch.tensor(indices_list, dtype=torch.long, device=device)
+    prompt_mapping_tensor = torch.tensor(prompt_mapping,
+                                         dtype=torch.long,
+                                         device=device)
+    embeddings_indices = torch.stack([
+        indices[2] * extra_vocab_size,
+        indices[2] * (vocab_size + extra_vocab_size),
+    ])
+    embeddings_indices[embeddings_indices == -1] = max_loras - 1
+    base_indices = indices[1]
+    sampler_indices = prompt_mapping_tensor
+    sampler_indices_padded = sampler_indices.clone()
+    sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1
+    sampler_indices_padded = torch.arange(
+        0, len(sampler_indices_padded), device=device, dtype=torch.long) + (
+            sampler_indices_padded * len(sampler_indices_padded))
+    long_lora_indices = None
+    long_lora_indices_len: Optional[int] = None
+    if long_lora_context:
+        long_lora_indices = indices[3]
+        long_lora_indices_len = long_lora_indices.shape[-1]
+    # Contain length of indices tensors. Used to index into each tensor.
+    indices_len = [
+        base_indices.shape[-1],
+        sampler_indices.shape[-1],
+        sampler_indices_padded.shape[-1],
+        embeddings_indices.shape[-1],
+    ]
+    if long_lora_indices_len is not None:
+        indices_len.append(long_lora_indices_len)
+    else:
+        # If long_lora doesn't exist,append None
+        indices_len.append(None)
+
+    return (
+        base_indices,
+        sampler_indices,
+        sampler_indices_padded,
+        embeddings_indices,
+        long_lora_indices,
+        indices_len,
+    )
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index d7b67425fcbc0..e631aec928ec5 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -1,14 +1,96 @@
-from typing import Optional
+from __future__ import annotations
 
-from vllm.logits_process import LogitsProcessor
-from vllm.sampling_params import GuidedDecodingParams
+from typing import TYPE_CHECKING
+
+from vllm.logger import init_logger
+from vllm.platforms import CpuArchEnum, current_platform
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+
+    from vllm.config import ModelConfig
+    from vllm.logits_process import LogitsProcessor
+    from vllm.sampling_params import GuidedDecodingParams
+
+logger = init_logger(__name__)
+
+
+def has_xgrammar_unsupported_json_features(schema: dict) -> bool:
+    """Check if JSON schema contains features unsupported by xgrammar."""
+
+    def check_object(obj: dict) -> bool:
+        if not isinstance(obj, dict):
+            return False
+
+        # Check for pattern restrictions
+        if "pattern" in obj:
+            return True
+
+        # Check for numeric ranges
+        if obj.get("type") in ("integer", "number") and any(
+                key in obj for key in [
+                    "minimum", "maximum", "exclusiveMinimum",
+                    "exclusiveMaximum", "multipleOf"
+                ]):
+            return True
+
+        # Recursively check all nested objects and arrays
+        for value in obj.values():
+            if isinstance(value, dict):
+                if check_object(value):
+                    return True
+            elif isinstance(value, list):
+                for item in value:
+                    if isinstance(item, dict) and check_object(item):
+                        return True
+
+        return False
+
+    return check_object(schema)
+
+
+def maybe_backend_fallback(
+        guided_params: GuidedDecodingParams) -> GuidedDecodingParams:
+    # lm-format-enforce doesn't support grammar, fallback to xgrammar
+    if (guided_params.backend == "lm-format-enforcer"
+            and guided_params.grammar is not None):
+        logger.warning(
+            "lm-format-enforcer does not support grammar guided decoding. "
+            "Falling back to use xgrammar instead.")
+        guided_params.backend = "xgrammar"
+
+    if guided_params.backend == "xgrammar":
+        # xgrammar only has x86 wheels for linux, fallback to outlines
+        if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
+            logger.warning("xgrammar is only supported on x86 CPUs. "
+                           "Falling back to use outlines instead.")
+            guided_params.backend = "outlines"
+
+        # xgrammar doesn't support regex or choice, fallback to outlines
+        if guided_params.regex is not None or guided_params.choice is not None:
+            logger.warning(
+                "xgrammar only supports json or grammar guided decoding. "
+                "Falling back to use outlines instead.")
+            guided_params.backend = "outlines"
+
+        # xgrammar doesn't support some JSON schema features
+        elif (guided_params.json is not None
+              and has_xgrammar_unsupported_json_features(guided_params.json)):
+            logger.warning(
+                "xgrammar does not support advanced JSON schema features like "
+                "patterns or numeric ranges. "
+                "Falling back to use outlines instead.")
+            guided_params.backend = "outlines"
+
+    return guided_params
 
 
 async def get_guided_decoding_logits_processor(
-        guided_params: GuidedDecodingParams,
-        tokenizer) -> Optional[LogitsProcessor]:
+        guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizer,
+        model_config: ModelConfig) -> LogitsProcessor | None:
+    guided_params = maybe_backend_fallback(guided_params)
     # CFG grammar not supported by LMFE, so we use outlines instead
-    if guided_params.backend == 'outlines' or guided_params.grammar:
+    if guided_params.backend == 'outlines':
         # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
         from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
             get_outlines_guided_decoding_logits_processor)
@@ -19,17 +101,23 @@ async def get_guided_decoding_logits_processor(
             get_local_lm_format_enforcer_guided_decoding_logits_processor)
         return get_local_lm_format_enforcer_guided_decoding_logits_processor(
             guided_params, tokenizer)
+    if guided_params.backend == 'xgrammar':
+        from vllm.model_executor.guided_decoding.xgrammar_decoding import (  # noqa
+            get_local_xgrammar_guided_decoding_logits_processor)
+        return get_local_xgrammar_guided_decoding_logits_processor(
+            guided_params, tokenizer, model_config)
 
     raise ValueError(
         f"Unknown guided decoding backend '{guided_params.backend}'. "
-        "Must be one of 'outlines, 'lm-format-enforcer'")
+        "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'")
 
 
 def get_local_guided_decoding_logits_processor(
-        guided_params: GuidedDecodingParams,
-        tokenizer) -> Optional[LogitsProcessor]:
+        guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizer,
+        model_config: ModelConfig) -> LogitsProcessor | None:
+    guided_params = maybe_backend_fallback(guided_params)
     # CFG grammar not supported by LMFE, so we use outlines instead
-    if guided_params.backend == 'outlines' or guided_params.grammar:
+    if guided_params.backend == 'outlines':
         # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
         from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
             get_local_outlines_guided_decoding_logits_processor)
@@ -40,7 +128,12 @@ def get_local_guided_decoding_logits_processor(
             get_local_lm_format_enforcer_guided_decoding_logits_processor)
         return get_local_lm_format_enforcer_guided_decoding_logits_processor(
             guided_params, tokenizer)
+    if guided_params.backend == 'xgrammar':
+        from vllm.model_executor.guided_decoding.xgrammar_decoding import (  # noqa
+            get_local_xgrammar_guided_decoding_logits_processor)
+        return get_local_xgrammar_guided_decoding_logits_processor(
+            guided_params, tokenizer, model_config)
 
     raise ValueError(
         f"Unknown guided decoding backend '{guided_params.backend}'. "
-        "Must be one of 'outlines, 'lm-format-enforcer'")
+        "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'")
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
new file mode 100644
index 0000000000000..b59a2269d2cd5
--- /dev/null
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -0,0 +1,266 @@
+# noqa: UP007
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, NamedTuple
+
+import torch
+from transformers import PreTrainedTokenizerFast
+
+try:
+    import xgrammar as xgr
+    from xgrammar.base import _core as xgr_core
+except ImportError:
+    pass
+
+from vllm.model_executor.guided_decoding.xgrammar_utils import (
+    convert_lark_to_gbnf, grammar_is_likely_lark)
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+
+    from vllm.config import ModelConfig
+    from vllm.sampling_params import GuidedDecodingParams
+
+
+# TODO: passing batch size to max threads here
+def get_local_xgrammar_guided_decoding_logits_processor(
+        guided_params: GuidedDecodingParams,
+        tokenizer: PreTrainedTokenizer,
+        model_config: ModelConfig,
+        max_threads: int = 8):
+    config = GrammarConfig.from_guided_params(guided_params=guided_params,
+                                              model_config=model_config,
+                                              tokenizer=tokenizer,
+                                              max_threads=max_threads)
+    return XGrammarLogitsProcessor(config)
+
+
+class TokenizerData(NamedTuple):
+    """Immutable container for cached tokenizer data."""
+    encoded_vocab: list[str]
+    stop_token_ids: list[int] | None
+    backend_str: str
+
+
+class TokenizerDataCache:
+    """Cache manager for tokenizer data to avoid repeated processing."""
+    _cache: dict[int, TokenizerData] = {}
+
+    @classmethod
+    def get_tokenizer_data(cls,
+                           tokenizer: PreTrainedTokenizer) -> TokenizerData:
+        tokenizer_hash = hash(tokenizer)
+
+        if tokenizer_hash not in cls._cache:
+            # Vendored from xgrammar logic since we cannot pickle the tokenizer
+            # https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 # noqa: E501
+            try:
+                encoded_vocab = [
+                    token for token, _ in sorted(tokenizer.get_vocab().items(),
+                                                 key=lambda x: x[1])
+                ]
+            except AttributeError as e:
+                raise ValueError(
+                    f"Cannot get the vocabulary of the tokenizer "
+                    f"{type(tokenizer)}. The tokenizer should have a "
+                    "get_vocab method.") from e
+
+            stop_token_ids = None
+            backend_str = xgr.VocabType.RAW
+            if isinstance(tokenizer, PreTrainedTokenizerFast):
+                backend_str = tokenizer.backend_tokenizer.to_str()
+                if stop_token_ids is None and hasattr(
+                        tokenizer,
+                        "eos_token_id") and tokenizer.eos_token_id is not None:
+                    stop_token_ids = [tokenizer.eos_token_id]
+
+            cls._cache[tokenizer_hash] = TokenizerData(
+                encoded_vocab=encoded_vocab,
+                stop_token_ids=stop_token_ids,
+                backend_str=backend_str)
+
+        return cls._cache[tokenizer_hash]
+
+
+class GrammarCompilerCache:
+    """
+    Cache for GrammarCompiler instances based on tokenizer.
+
+    This cache reduces the overhead of creating new compiler instances when
+    using the same tokenizer configuration.
+    """
+    _cache: dict[str, xgr.GrammarCompiler] = {}
+
+    @classmethod
+    def get_compiler(cls, config: GrammarConfig) -> xgr.GrammarCompiler:
+        cache_key = str(config.tokenizer_hash)
+
+        if cache_key not in cls._cache:
+            assert config.encoded_vocab is not None
+            tokenizer_info = xgr.TokenizerInfo._create_from_handle(
+                xgr_core.TokenizerInfo.from_huggingface(
+                    config.encoded_vocab, config.backend_str,
+                    config.vocab_size, config.stop_token_ids))
+            cls._cache[cache_key] = xgr.GrammarCompiler(
+                tokenizer_info, max_threads=config.max_threads)
+
+        return cls._cache[cache_key]
+
+
+@dataclass
+class GrammarConfig:
+    """Serializable configuration for grammar compilation"""
+    tokenizer_hash: int
+    vocab_size: int
+    json_str: str | None = None
+    grammar_str: str | None = None
+    json_object: bool | None = None
+    max_threads: int = 8
+    # Only populated if tokenizer_hash not in cache
+    encoded_vocab: list[str] | None = None
+    stop_token_ids: list[int] | None = None
+    backend_str: str | None = None
+
+    @classmethod
+    def from_guided_params(cls,
+                           guided_params: GuidedDecodingParams,
+                           model_config: ModelConfig,
+                           tokenizer: PreTrainedTokenizer,
+                           max_threads: int = 8) -> GrammarConfig:
+
+        tokenizer_hash = hash(tokenizer)
+        # Only get tokenizer data if not already cached
+        if tokenizer_hash in TokenizerDataCache._cache:
+            encoded_vocab = None
+            stop_token_ids = None
+            backend_str = None
+        else:
+            tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer)
+            encoded_vocab = tokenizer_data.encoded_vocab
+            stop_token_ids = tokenizer_data.stop_token_ids
+            backend_str = tokenizer_data.backend_str
+
+        if guided_params.json:
+            if not isinstance(guided_params.json, str):
+                json_str = json.dumps(guided_params.json)
+            else:
+                json_str = guided_params.json
+            return cls(json_str=json_str,
+                       vocab_size=model_config.hf_config.vocab_size,
+                       encoded_vocab=encoded_vocab,
+                       stop_token_ids=stop_token_ids,
+                       backend_str=backend_str,
+                       tokenizer_hash=tokenizer_hash,
+                       max_threads=max_threads)
+        elif guided_params.grammar:
+            # XGrammar only supports GBNF grammars, so we must convert Lark
+            if grammar_is_likely_lark(guided_params.grammar):
+                try:
+                    grammar_str = convert_lark_to_gbnf(guided_params.grammar)
+                except ValueError as e:
+                    raise ValueError(
+                        "Failed to convert the grammar from Lark to GBNF. "
+                        "Please either use GBNF grammar directly or specify"
+                        " --guided-decoding-backend=outlines.\n"
+                        f"Conversion error: {str(e)}") from e
+            else:
+                grammar_str = guided_params.grammar
+            return cls(grammar_str=grammar_str,
+                       vocab_size=model_config.hf_config.vocab_size,
+                       encoded_vocab=encoded_vocab,
+                       stop_token_ids=stop_token_ids,
+                       backend_str=backend_str,
+                       tokenizer_hash=tokenizer_hash,
+                       max_threads=max_threads)
+        elif guided_params.json_object:
+            return cls(json_object=True,
+                       vocab_size=model_config.hf_config.vocab_size,
+                       encoded_vocab=encoded_vocab,
+                       stop_token_ids=stop_token_ids,
+                       backend_str=backend_str,
+                       tokenizer_hash=tokenizer_hash,
+                       max_threads=max_threads)
+        else:
+            raise ValueError(
+                "Currently only support JSON and EBNF grammar mode for xgrammar"
+            )
+
+
+@dataclass
+class XGrammarLogitsProcessor:
+    """Wrapper class to support pickle protocol"""
+    config: GrammarConfig
+
+    ctx: xgr.CompiledGrammar | None = None
+    token_bitmask: torch.Tensor = None  # type: ignore[assignment]
+    matchers: list[xgr.GrammarMatcher] = field(default_factory=list)
+    batch_size: int = field(default=1)
+    prefilled: bool = field(default=False)
+
+    def __getstate__(self) -> dict[str, Any]:
+        return {'config': self.config}
+
+    def __setstate__(self, state: dict[str, Any]):
+        self.config = state['config']
+
+        self.ctx = None
+        self.matchers = []
+        self.batch_size = 1
+        self.token_bitmask = None  # type: ignore[assignment]
+        self.prefilled = False
+
+    def _ensure_ctx(self):
+        """Lazily initialize the processor in the worker process"""
+        if self.ctx is None:
+            compiler = GrammarCompilerCache.get_compiler(self.config)
+            if self.config.json_str is not None:
+                self.ctx = compiler.compile_json_schema(self.config.json_str)
+            elif self.config.grammar_str is not None:
+                self.ctx = compiler.compile_grammar(self.config.grammar_str)
+            elif self.config.json_object:
+                self.ctx = compiler.compile_builtin_json_grammar()
+            else:
+                raise ValueError(
+                    "Invalid configuration for xgrammar logits processor")
+
+    def __call__(self, input_ids: list[int],
+                 scores: torch.Tensor) -> torch.Tensor:
+        if self.ctx is None:
+            self._ensure_ctx()
+
+        if len(self.matchers) == 0:
+            self.matchers = [
+                xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
+            ]
+            self.token_bitmask = xgr.allocate_token_bitmask(
+                self.batch_size, self.config.vocab_size)
+
+        if not self.prefilled:
+            # Have not sampled a token yet
+            self.prefilled = True
+        else:
+            for i, matcher in enumerate(self.matchers):
+                if not matcher.is_terminated():
+                    sampled_token = input_ids[-1]
+                    assert self.matchers[i].accept_token(sampled_token)
+
+        for i, matcher in enumerate(self.matchers):
+            if not matcher.is_terminated():
+                # @ubospica: ideally, fill_next_token_bitmask should be
+                # parallelized with model decoding
+                # See https://github.com/vllm-project/vllm/pull/10785/files#r1864278303
+                matcher.fill_next_token_bitmask(self.token_bitmask, i)
+
+        # token_bitmask is a CPU tensor for use with accept_token and
+        # fill_next_token_bitmask so we move it to the device of scores
+        device_type = scores.device.type
+        if device_type != "cuda":
+            scores = scores.to("cpu")
+        xgr.apply_token_bitmask_inplace(scores,
+                                        self.token_bitmask.to(scores.device))
+        if device_type != "cuda":
+            scores = scores.to(device_type)
+
+        return scores
diff --git a/vllm/model_executor/guided_decoding/xgrammar_utils.py b/vllm/model_executor/guided_decoding/xgrammar_utils.py
new file mode 100644
index 0000000000000..12b42245f4e3d
--- /dev/null
+++ b/vllm/model_executor/guided_decoding/xgrammar_utils.py
@@ -0,0 +1,162 @@
+import re
+
+
+def grammar_is_likely_lark(grammar_str: str) -> bool:
+    """
+    Check if grammar appears to use Lark syntax.
+    
+    Args:
+        grammar_str: Input grammar string
+        
+    Returns:
+        bool: True if grammar appears to be in Lark format, False otherwise
+        
+    Examples:
+        >>> grammar_is_likely_lark("rule: 'abc'")
+        True
+        >>> grammar_is_likely_lark("rule ::= 'abc'")
+        False
+    """
+    if not grammar_str or not isinstance(grammar_str, str):
+        return False
+
+    for line in grammar_str.split('\n'):
+        # Remove both comment styles
+        line = re.sub(r'(#|//).*$', '', line).strip()
+        if not line:
+            continue
+
+        # Look for Lark-style rule definitions
+        if ':' in line and '::=' not in line:
+            return True
+
+        # Look for Lark-specific features
+        if any(pattern in line for pattern in ['?start:', '|', '~']):
+            return True
+
+    return False
+
+
+def convert_lark_to_gbnf(grammar_str: str) -> str:
+    """
+    Convert a Lark grammar string to GBNF format.
+
+    GBNF reference:
+    https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
+    Lark grammar reference:
+    https://lark-parser.readthedocs.io/en/latest/grammar.html
+    
+    Args:
+        grammar_str: Input grammar in Lark format
+        
+    Returns:
+        str: Converted grammar in GBNF format
+        
+    Examples:
+        >>> print(convert_lark_to_gbnf("rule: 'hello'"))
+        root ::= rule
+        rule ::= "hello"
+    """
+    if not isinstance(grammar_str, str):
+        raise ValueError(f"Grammar must be a string, got {type(grammar_str)}")
+    if not grammar_str.strip():
+        raise ValueError("Grammar string cannot be empty")
+
+    defined_rules = set()
+    referenced_rules = set()
+    output_lines = []
+
+    def clean_line(line: str) -> str:
+        """Remove comments and whitespace from line."""
+        return re.sub(r'(#|//).*$', '', line).strip()
+
+    def check_quotes(text: str, rule_name: str, line_num: int) -> None:
+        """Validate quote matching in text."""
+        if text.count("'") % 2 != 0 or text.count('"') % 2 != 0:
+            raise ValueError(
+                f"Mismatched quotes in {rule_name} on line {line_num}")
+
+    def extract_references(text: str) -> set:
+        """Extract rule references from text."""
+        # Remove quoted strings and special characters
+        text = re.sub(r'"[^"]*"', '', text)
+        text = re.sub(r'[+*?()|\[\]{}]', ' ', text)
+        return set(re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', text))
+
+    # First pass: Find root rule and validate rule definitions
+    lines = [clean_line(line) for line in grammar_str.split('\n')]
+    first_rule = None
+
+    for line_num, line in enumerate(lines, 1):
+        if not line or line.startswith('|'):
+            continue
+
+        if ':' in line:
+            try:
+                name = line.split(':', 1)[0].strip().strip('?')
+                defined_rules.add(name)
+                if first_rule is None:
+                    first_rule = name
+                if name == 'start':
+                    first_rule = 'start'
+            except IndexError as e:
+                raise ValueError(f"Invalid rule format on line {line_num}. "
+                                 "Expected 'rule_name: definition'") from e
+
+    if not defined_rules:
+        raise ValueError("No valid rules found in grammar")
+
+    # Add root rule
+    output_lines.append(f"root ::= {first_rule}")
+
+    # Second pass: Process rule definitions and alternatives
+    current_rule = None
+    current_definition = []
+
+    for line_num, line in enumerate(lines, 1):
+        if not line:
+            continue
+
+        try:
+            if ':' in line and not line.startswith('|'):
+                # Save previous rule if exists
+                if current_rule:
+                    output_lines.append(
+                        f"{current_rule} ::= {' | '.join(current_definition)}")
+
+                # Process new rule
+                name, definition = line.split(':', 1)
+                current_rule = name.strip().strip('?')
+
+                check_quotes(definition, f"rule '{current_rule}'", line_num)
+                definition = re.sub(r"'([^']*)'", r'"\1"', definition)
+                referenced_rules.update(extract_references(definition))
+                current_definition = [definition.strip()]
+
+            elif line.startswith('|'):
+                if not current_rule:
+                    raise ValueError(f"Alternative '|' on line {line_num} "
+                                     "without a preceding rule definition")
+
+                alt_def = line[1:].strip()
+                check_quotes(alt_def, f"alternative for rule '{current_rule}'",
+                             line_num)
+                alt_def = re.sub(r"'([^']*)'", r'"\1"', alt_def)
+                referenced_rules.update(extract_references(alt_def))
+                current_definition.append(alt_def)
+
+        except ValueError as e:
+            raise ValueError(f"Error on line {line_num}: {str(e)}") from e
+
+    # Add final rule if exists
+    if current_rule:
+        output_lines.append(
+            f"{current_rule} ::= {' | '.join(current_definition)}")
+
+    # Validate all rules are defined
+    undefined_rules = referenced_rules - defined_rules - {'root'}
+    if undefined_rules:
+        raise ValueError("Referenced rules are not defined: "
+                         f"{', '.join(sorted(undefined_rules))}")
+
+    return '\n'.join(output_lines)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 0e12bc5691538..fdc4c6305bd5e 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -6,7 +6,6 @@
 import glob
 import inspect
 import itertools
-import json
 import math
 import os
 import warnings
@@ -18,7 +17,7 @@
 import huggingface_hub
 import numpy as np
 import torch
-from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub import HfApi
 from torch import nn
 from transformers import AutoModelForCausalLM
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
@@ -102,12 +101,10 @@ def _initialize_model(
     vllm_config: VllmConfig,
     *,
     prefix: str = "",
-    architectures: Optional[list[str]] = None,
 ) -> nn.Module:
     """Initialize a model with the given configurations."""
     model_config = vllm_config.model_config
-    model_class, _ = get_model_architecture(model_config,
-                                            architectures=architectures)
+    model_class, _ = get_model_architecture(model_config)
 
     signatures = inspect.signature(model_class.__init__)
     all_params = [param.name for param in signatures.parameters.values()]
@@ -704,51 +701,9 @@ def __init__(self, load_config: LoadConfig):
         self.unsharded_weights_modules: List[str] = []
         # Save the module names that are sharded by column.
         self.column_sharded_weights_modules: List[str] = []
-        # we don't need to quantize the whole model, only the target modules
-        # that are specified in the adapter config file. If the adapter config
-        # file is not provided, we will quantize the default modules.
-        if (not load_config.model_loader_extra_config
-                or "qlora_adapter_name_or_path"
-                not in load_config.model_loader_extra_config):
-            self.target_modules = []
-            return
-
-        qlora_adapter = load_config.model_loader_extra_config[
-            "qlora_adapter_name_or_path"]
-
-        config_file_path = self._get_config_file(qlora_adapter)
-
-        with open(config_file_path) as f:
-            config = json.load(f)
-            self.target_modules = config["target_modules"]
-            # TODO: target_modules could be either a list or a regex string.
-            # We need to handle both cases.
-            assert isinstance(self.target_modules,
-                              list), "Unsupported target_modules: "
-            f"{self.target_modules}"
-
-    def _get_config_file(self, qlora_adapter: str) -> str:
-        is_local = os.path.isdir(qlora_adapter)
-        config_file_path = None
-        if is_local:
-            for file in self.possible_config_file_names:
-                config_file_path = os.path.join(qlora_adapter, file)
-                if os.path.exists(config_file_path):
-                    break
-        else:
-            hf_api = HfApi()
-            repo_files = hf_api.list_repo_files(repo_id=qlora_adapter)
-            for file in self.possible_config_file_names:
-                if file in repo_files:
-                    config_file_path = hf_hub_download(repo_id=qlora_adapter,
-                                                       filename=file)
-                    break
-
-        if not config_file_path:
-            raise ValueError(
-                f"Cannot find adapter config file in {qlora_adapter}")
-
-        return config_file_path
+        # Store all module names (from transformers) that support
+        # BNB quantization.
+        self.target_modules: List[str] = []
 
     def _get_weight_files(
         self,
@@ -1030,25 +985,16 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None:
                 inverse_stacked_mapping[packed] = []
             inverse_stacked_mapping[packed].insert(idx, orig)
 
-        linear_module_lst = []
         for name, module in model.named_modules():
             if isinstance(module, (LinearBase, )):
                 last_name = name.split(".")[-1]
                 if sub_modules := inverse_stacked_mapping.get(last_name, []):
                     # Map vllm's names to transformers' names.
                     for sub_name in sub_modules:
-                        linear_module_lst.append(
+                        self.target_modules.append(
                             name.replace(last_name, sub_name))
                 else:
-                    linear_module_lst.append(name)
-        if self.target_modules:
-            # Update self.target_modules
-            self.target_modules = [
-                qual_name for qual_name in linear_module_lst
-                if any(t in qual_name for t in self.target_modules)
-            ]
-        else:
-            self.target_modules = linear_module_lst
+                    self.target_modules.append(name)
         assert (self.target_modules
                 ), "vllm currently does not support BNB quantization for"
         f" {type(model).__name__}"
@@ -1120,7 +1066,14 @@ def _load_weights(self, model_config: ModelConfig,
                                                  model_config.revision,
                                                  pre_quant, load_8bit))
 
-        model.load_weights(qweight_iterator)
+        weights_to_load = {name for name, _ in model.named_parameters()}
+        loaded_weights = model.load_weights(qweight_iterator)
+        # Some models may have weights loading tracker unimplemented.
+        if loaded_weights is not None:
+            weights_not_loaded = weights_to_load - loaded_weights
+            if weights_not_loaded:
+                raise ValueError("Following weights were not initialized from "
+                                 f"checkpoint: {weights_not_loaded}")
 
         torch.cuda.empty_cache()
 
@@ -1152,9 +1105,10 @@ def _load_weights(self, model_config: ModelConfig,
                         shard_name, weight_name)
                     break
 
+            # Models like Clip/Siglip may skip some layers in initialization,
+            # causing unused quant_param_name in state_dict.
             if quant_param_name not in param_dict:
-                raise ValueError(
-                    f"Parameter {quant_param_name} not found in the model.")
+                continue
 
             if quant_param_name not in stacked_quant_state_dict:
                 stacked_quant_state_dict[quant_param_name] = {}
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 864dd04e79921..cfb89e0f336bc 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -1,6 +1,6 @@
 """Utilities for selecting and loading models."""
 import contextlib
-from typing import Optional, Tuple, Type
+from typing import Tuple, Type
 
 import torch
 from torch import nn
@@ -20,12 +20,8 @@ def set_default_torch_dtype(dtype: torch.dtype):
 
 
 def get_model_architecture(
-    model_config: ModelConfig,
-    *,
-    architectures: Optional[list[str]] = None,
-) -> Tuple[Type[nn.Module], str]:
-    if architectures is None:
-        architectures = getattr(model_config.hf_config, "architectures", [])
+        model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
+    architectures = getattr(model_config.hf_config, "architectures", [])
 
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index fa6b95f5481ad..dd4b0c75cb84d 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -32,9 +32,8 @@
                                               maybe_prefix,
                                               merge_multimodal_embeddings)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.image import cached_get_image_processor
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    repeat_and_pad_placeholder_tokens)
 from vllm.sequence import IntermediateTensors
@@ -451,7 +450,7 @@ def get_max_multimodal_tokens(ctx):
 
 
 def input_mapper_for_aria(ctx, data):
-    return MultiModalInputs(data)
+    return MultiModalKwargs(data)
 
 
 def input_processor(ctx, llm_inputs):
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 6af59697160a0..42a239cadac46 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -4,11 +4,10 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from PIL import Image
 from transformers import Blip2VisionConfig, BlipVisionConfig
 
-from vllm.attention.selector import _Backend
+from vllm.attention.layer import MultiHeadAttention
 from vllm.config import ModelConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.inputs import DecoderOnlyInputs, token_inputs
@@ -22,8 +21,6 @@
                                    repeat_and_pad_placeholder_tokens)
 from vllm.sequence import SequenceData
 
-from .utils import get_vit_attn_backend
-
 
 def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
     assert image_size % patch_size == 0
@@ -205,11 +202,8 @@ def __init__(
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        # Detect attention implementation.
-        self.attn_backend = get_vit_attn_backend(support_fa=False)
-        if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}:
-            raise RuntimeError(
-                f"BLIP does not support {self.attn_backend} backend now.")
+        self.attn = MultiHeadAttention(self.num_heads_per_partition,
+                                       self.head_dim, self.scale)
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads,
@@ -220,41 +214,10 @@ def forward(
         hidden_states: torch.Tensor,
     ):
         """Input shape: Batch x Time x Channel"""
-        bsz, tgt_len, _ = hidden_states.size()
 
         qkv_states, _ = self.qkv(hidden_states)
         query_states, key_states, value_states = qkv_states.chunk(3, dim=-1)
-        query_states = query_states.view(bsz, tgt_len,
-                                         self.num_heads_per_partition,
-                                         self.head_dim)
-        key_states = key_states.view(bsz, tgt_len,
-                                     self.num_heads_per_partition,
-                                     self.head_dim)
-        value_states = value_states.view(bsz, tgt_len,
-                                         self.num_heads_per_partition,
-                                         self.head_dim)
-
-        if self.attn_backend == _Backend.XFORMERS:
-            from xformers import ops as xops
-
-            out = xops.memory_efficient_attention_forward(query_states,
-                                                          key_states,
-                                                          value_states,
-                                                          p=self.dropout,
-                                                          scale=self.scale)
-        elif self.attn_backend == _Backend.TORCH_SDPA:
-            query_states, key_states, value_states = (x.transpose(1, 2)
-                                                      for x in (query_states,
-                                                                key_states,
-                                                                value_states))
-            out = F.scaled_dot_product_attention(query_states,
-                                                 key_states,
-                                                 value_states,
-                                                 dropout_p=self.dropout,
-                                                 scale=self.scale)
-            out = out.transpose(1, 2)
-
-        out = out.view(bsz, tgt_len, -1)
+        out = self.attn(query_states, key_states, value_states)
         attn_output, _ = self.projection(out)
 
         return attn_output, None
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index cd89519e95986..a5300dfd986f3 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -5,11 +5,10 @@
 import numpy as np
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from PIL import Image
 from transformers import CLIPVisionConfig
 
-from vllm.attention.selector import _Backend
+from vllm.attention.layer import MultiHeadAttention
 from vllm.config import ModelConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.inputs import DecoderOnlyInputs, token_inputs
@@ -25,8 +24,6 @@
                                    resolve_visual_encoder_outputs)
 from vllm.sequence import SequenceData
 
-from .utils import get_vit_attn_backend
-
 
 def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
     assert image_size % patch_size == 0
@@ -235,11 +232,8 @@ def __init__(
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        # Detect attention implementation.
-        self.attn_backend = get_vit_attn_backend(support_fa=False)
-        if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}:
-            raise RuntimeError(
-                f"CLIP does not support {self.attn_backend} backend now.")
+        self.attn = MultiHeadAttention(self.num_heads_per_partition,
+                                       self.head_dim, self.scale)
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads,
@@ -250,42 +244,10 @@ def forward(
         hidden_states: torch.Tensor,
     ):
         """Input shape: Batch x Time x Channel"""
-        bsz, tgt_len, _ = hidden_states.size()
 
         qkv_states, _ = self.qkv_proj(hidden_states)
         query_states, key_states, value_states = qkv_states.chunk(3, dim=-1)
-
-        query_states = query_states.view(bsz, tgt_len,
-                                         self.num_heads_per_partition,
-                                         self.head_dim)
-        key_states = key_states.view(bsz, tgt_len,
-                                     self.num_heads_per_partition,
-                                     self.head_dim)
-        value_states = value_states.view(bsz, tgt_len,
-                                         self.num_heads_per_partition,
-                                         self.head_dim)
-
-        if self.attn_backend == _Backend.XFORMERS:
-            from xformers import ops as xops
-
-            out = xops.memory_efficient_attention_forward(query_states,
-                                                          key_states,
-                                                          value_states,
-                                                          p=self.dropout,
-                                                          scale=self.scale)
-        elif self.attn_backend == _Backend.TORCH_SDPA:
-            query_states, key_states, value_states = (x.transpose(1, 2)
-                                                      for x in (query_states,
-                                                                key_states,
-                                                                value_states))
-            out = F.scaled_dot_product_attention(query_states,
-                                                 key_states,
-                                                 value_states,
-                                                 dropout_p=self.dropout,
-                                                 scale=self.scale)
-            out = out.transpose(1, 2)
-
-        out = out.view(bsz, tgt_len, -1)
+        out = self.attn(query_states, key_states, value_states)
         attn_output, _ = self.out_proj(out)
 
         return attn_output, None
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 5ca26d53a17e7..0398f0943a70a 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -473,10 +473,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
 
+        self.sampler = get_sampler()
+
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py
index b2557f67edbda..5db9786ea05b7 100644
--- a/vllm/model_executor/models/glm4_vision_encoder.py
+++ b/vllm/model_executor/models/glm4_vision_encoder.py
@@ -8,6 +8,7 @@
 from torch import nn
 from torch.nn import LayerNorm
 
+from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -78,6 +79,8 @@ def __init__(
             quant_config=quant_config,
         )
 
+        self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim,
+                                       self.scale)
         self.output_dropout = torch.nn.Dropout(config.dropout_prob)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -105,24 +108,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             except ModuleNotFoundError:
                 pass
 
-        B, L, _ = x.shape
         qkv, _ = self.query_key_value(x)  # B, L, 3 * H * D
         q, k, v = qkv.chunk(3, dim=-1)
-        q = q.reshape(B, L, self.num_heads_per_rank,
-                      self.head_dim).permute(0, 2, 1, 3)  # B, H, L, D
-        k = k.reshape(B, L, self.num_heads_per_rank,
-                      self.head_dim).permute(0, 2, 1, 3)  # B, H, L, D
-        v = v.reshape(B, L, self.num_heads_per_rank,
-                      self.head_dim).permute(0, 2, 1, 3)  # B, H, L, D
-
-        out = torch.nn.functional.scaled_dot_product_attention(q,
-                                                               k,
-                                                               v,
-                                                               attn_mask=None,
-                                                               dropout_p=0.,
-                                                               is_causal=False)
-
-        output, _ = self.dense(out.transpose(1, 2).view(B, L, -1))
+
+        out = self.attn(q, k, v)
+        output, _ = self.dense(out)
         output = self.output_dropout(output)
         return output
 
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index bd2394e71c973..f9e0443b9a508 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -400,16 +400,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 self.lm_head.weight = self.model.embed_tokens.weight
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-
             if hasattr(config, "logits_scaling"):
                 logit_scale /= config.logits_scaling
+
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     scale=logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
 
+        self.sampler = get_sampler()
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index 16192928beb1f..e430a158d869a 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -21,8 +21,8 @@
 from torch import nn
 from transformers.models.idefics2.configuration_idefics2 import (
     Idefics2Config, Idefics2VisionConfig)
-from xformers import ops as xops
 
+from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -141,35 +141,18 @@ def __init__(
         )
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
-        self.is_causal = False
+        self.attn = MultiHeadAttention(self.num_heads_per_partition,
+                                       self.head_dim, self.scale)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor:
-        batch_size, q_len, _ = hidden_states.size()
         qkv, _ = self.qkv_proj(
             hidden_states
         )  # batch_size, q_len, 3 * num_heads_per_partition * head_dim
         query_states, key_states, value_states = qkv.chunk(3, dim=-1)
-        query_states = query_states.view(batch_size, q_len,
-                                         self.num_heads_per_partition,
-                                         self.head_dim)
-        key_states = key_states.view(batch_size, q_len,
-                                     self.num_heads_per_partition,
-                                     self.head_dim)
-        value_states = value_states.view(batch_size, q_len,
-                                         self.num_heads_per_partition,
-                                         self.head_dim)
-        # see: https://facebookresearch.github.io/xformers/components/ops.html
-        out = xops.memory_efficient_attention_forward(
-            query_states,
-            key_states,
-            value_states,
-            p=self.dropout,
-            scale=self.scale,
-        )
-        out = out.view(batch_size, q_len, -1)
+        out = self.attn(query_states, key_states, value_states)
         attn_output, _ = self.out_proj(out)
         return attn_output
 
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 01a381381ccec..c3979eab905db 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -36,6 +36,11 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
         """
         Returns multimodal embeddings generated from multimodal kwargs 
         to be merged with text embeddings.
+
+        The output embeddings must be one of the following formats:
+        - A list or tuple of 2D tensors, where each tensor corresponds to 
+          each input image.
+        - A single 3D tensor, with the batch dimension grouping the 2D tensors.
         """
         ...
 
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index c4346fcb3bd2a..7ff68bd60e8ad 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -12,7 +12,7 @@
 import torch.nn.functional as F
 from transformers import PretrainedConfig
 
-from vllm.attention.selector import _Backend
+from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
@@ -25,8 +25,6 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from .utils import get_vit_attn_backend
-
 NORM2FN = {
     'rms_norm': RMSNorm,
     'layer_norm': nn.LayerNorm,
@@ -183,10 +181,8 @@ def __init__(
             prefix=f"{prefix}.proj",
         )
 
-        self.attn_backend = get_vit_attn_backend(support_fa=False)
-        if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}:
-            raise RuntimeError(
-                f"InternViT does not support {self.attn_backend} backend now.")
+        self.attn = MultiHeadAttention(self.num_heads_per_partition,
+                                       self.head_dim, self.scale)
 
     def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor):
         if self.tp_size > 1:
@@ -209,23 +205,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         if self.qk_normalization:
             q, k = self._apply_qk_norm(q, k)
 
-        q = q.view(B, N, self.num_heads_per_partition, self.head_dim)
-        k = k.view(B, N, self.num_heads_per_partition, self.head_dim)
-        v = v.view(B, N, self.num_heads_per_partition, self.head_dim)
-
-        if self.attn_backend == _Backend.XFORMERS:
-            from xformers import ops as xops
-
-            out = xops.memory_efficient_attention_forward(q,
-                                                          k,
-                                                          v,
-                                                          scale=self.scale)
-        elif self.attn_backend == _Backend.TORCH_SDPA:
-            q, k, v = (x.transpose(1, 2) for x in (q, k, v))
-            out = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
-            out = out.transpose(1, 2)
-
-        out = out.view(B, N, -1)
+        out = self.attn(q, k, v)
         out, _ = self.proj(out)
         return out
 
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 86aab38032450..42c769f79e202 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -26,7 +26,7 @@
                                                    InternVisionPatchModel)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
@@ -52,12 +52,18 @@ class InternVLImagePixelInputs(TypedDict):
     Shape:
     `(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
     """
+    patches_per_image: List[int]
+    """
+    List of number of total patches for each image in the batch.
+    """
 
 
 class InternVLImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
-    data: torch.Tensor
-    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+    data: NestedTensors
+    """ 
+    A tensor of shape `(num_images, total_image_feature_size, hidden_size)`
+    or a list of tensors of shape `(total_image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
@@ -349,10 +355,32 @@ def input_processor(
         new_prompt = self._expand_image_prompt(prompt, image_feature_sizes,
                                                num_patches)
         new_prompt_token_ids = tokenizer.encode(new_prompt)
+        img_context_token_id = tokenizer.encode(self.img_context_token,
+                                                add_special_tokens=False)
+        assert len(img_context_token_id) == 1, \
+            (f"Invalid image token '{self.img_context_token}': A valid image "
+            f"token encodes to a single token ID, got {img_context_token_id}.")
+        img_context_token_id = img_context_token_id[0]
+
+        # Get precise tracking of placeholder positions
+        token_idx = image_idx = 0
+        placeholder_ranges = []
+        while token_idx < len(new_prompt_token_ids):
+            if new_prompt_token_ids[token_idx] == img_context_token_id:
+                curr_image_featue_size = image_feature_sizes[image_idx]
+                placeholder_ranges.append(
+                    PlaceholderRange(offset=token_idx,
+                                     length=curr_image_featue_size))
+                image_idx += 1
+                token_idx += curr_image_featue_size
+            else:
+                token_idx += 1
 
-        return token_inputs(prompt=prompt,
-                            prompt_token_ids=new_prompt_token_ids,
-                            multi_modal_data=multi_modal_data)
+        return token_inputs(
+            prompt=prompt,
+            prompt_token_ids=new_prompt_token_ids,
+            multi_modal_data=multi_modal_data,
+            multi_modal_placeholders={"image": placeholder_ranges})
 
     def input_mapper(
         self,
@@ -482,6 +510,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.mlp1 = self._init_mlp1(config)
 
         self.img_context_token_id = None
+        self.visual_token_mask = None
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
@@ -613,35 +642,54 @@ def _parse_and_validate_image_input(
             if not isinstance(pixel_values, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
+
+            patches_per_image = []
+            for request_pixel_values in pixel_values:
+                for image_pixel_values in request_pixel_values:
+                    patches_per_image.append(image_pixel_values.shape[0])
             # We need to flatten (B, N, P) to (B*N*P),
             # so we call flatten_bn twice.
             return InternVLImagePixelInputs(
                 type="pixel_values",
                 data=self._validate_pixel_values(
                     flatten_bn(flatten_bn(pixel_values), concat=True)),
-            )
+                patches_per_image=patches_per_image)
 
         raise AssertionError("This line should be unreachable.")
 
     def _process_image_input(
         self,
         image_input: InternVLImageInputs,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
         assert self.vision_model is not None
+
         image_embeds = self.extract_feature(image_input["data"])
 
+        patches_per_image = image_input["patches_per_image"]
+        if len(patches_per_image) == 1:
+            image_embeds = image_embeds.unsqueeze(0)
+            return image_embeds
+
+        # NOTE: Image embeddings are split into separate tensors for each image
+        # by the size of each embedding.
+        feature_size = image_embeds.shape[1]
+        image_embeds = image_embeds.view(-1,
+                                         self.config.text_config.hidden_size)
+        image_feature_sizes = [
+            num_patches * feature_size for num_patches in patches_per_image
+        ]
+        image_embeds = image_embeds.split(image_feature_sizes)
         return image_embeds
 
-    def _get_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def _set_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor:
         if self.is_mono:
-            visual_token_mask = (
+            self.visual_token_mask = (
                 input_ids == self.img_context_token_id).reshape(-1, 1)
         else:
-            visual_token_mask = None
-        return visual_token_mask
+            self.visual_token_mask = None
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
@@ -658,6 +706,7 @@ def get_input_embeddings(
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
             assert self.img_context_token_id is not None
+            self._set_visual_token_mask(input_ids)
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.img_context_token_id)
@@ -674,7 +723,6 @@ def forward(
         **kwargs: object,
     ) -> Union[SamplerOutput, IntermediateTensors]:
 
-        visual_token_mask = None
         if intermediate_tensors is not None:
             input_ids = None
             inputs_embeds = None
@@ -695,15 +743,12 @@ def forward(
             "intermediate_tensors": intermediate_tensors,
             "inputs_embeds": inputs_embeds,
         }
-        if self.img_context_token_id is not None:
-            visual_token_mask = self._get_visual_token_mask(input_ids)
 
-            # We always overwrite it back to None after computing visual token
-            # mask so that this doesn't need to depend on encoder output
-            self.img_context_token_id = None
-
-        if self.is_mono:
-            forward_kwargs.update({"visual_token_mask": visual_token_mask})
+        # Only required if the model is mono-architecture
+        if self.visual_token_mask is not None:
+            forward_kwargs.update(
+                {"visual_token_mask": self.visual_token_mask})
+            self.visual_token_mask = None
 
         hidden_states = self.language_model.model(**forward_kwargs)
         return hidden_states
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 099ca7e12b288..5d5e8ae1ee532 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -7,7 +7,7 @@
 
 from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.layer import Attention
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -25,8 +25,6 @@
                                                     MambaCacheParams)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE,
-                                      _get_graph_batch_size)
 
 from .interfaces import HasInnerState, SupportsLoRA
 from .utils import maybe_prefix
@@ -404,7 +402,7 @@ def forward(self,
                 inputs_embeds: Optional[torch.Tensor] = None,
                 **kwargs):
         if self.mamba_cache is None:
-            max_batch_size = (_get_graph_batch_size(
+            max_batch_size = (VllmConfig.get_graph_batch_size(
                 self.scheduler_config.max_num_seqs) if self.scheduler_config
                               else max(_BATCH_SIZES_TO_CAPTURE) + 2)
 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index bc24a7d306afa..2208efdf10796 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -575,10 +575,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
 
+        self.sampler = get_sampler()
+
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index db7fa82ceb9b7..65c6bd07bfff0 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,37 +1,41 @@
 from functools import cached_property
+from types import MethodType
 from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set,
                     Tuple, TypedDict, Union)
 
 import torch
 import torch.nn as nn
-from PIL import Image
-from transformers import (CLIPVisionConfig, LlavaConfig, PixtralVisionConfig,
-                          PretrainedConfig, SiglipVisionConfig)
+from PIL.Image import Image
+from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig,
+                          PixtralVisionConfig, PretrainedConfig,
+                          ProcessorMixin, SiglipVisionConfig)
+from transformers.models.pixtral import PixtralProcessor
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext)
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        InputProcessingContext,
+                                        ModalityProcessingMetadata,
+                                        MultiModalProcessingMetadata,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of
 
 from .clip import (CLIPVisionModel, dummy_image_for_clip,
-                   dummy_seq_data_for_clip, get_max_clip_image_tokens,
-                   input_processor_for_clip)
+                   get_max_clip_image_tokens)
 from .interfaces import SupportsMultiModal, SupportsPP
 from .pixtral import (PixtralHFVisionModel, dummy_image_for_pixtral_hf,
-                      dummy_seq_data_for_pixtral_hf,
-                      get_max_pixtral_hf_image_tokens,
-                      input_processor_for_pixtral_hf)
+                      get_max_pixtral_hf_image_tokens)
 from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
-                     dummy_seq_data_for_siglip, get_max_siglip_image_tokens,
-                     input_processor_for_siglip)
+                     get_max_siglip_image_tokens)
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
@@ -59,25 +63,32 @@ class LlavaImageEmbeddingInputs(TypedDict):
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageEmbeddingInputs]
 
 
-# TODO(xwjiang): Run benchmark and decide if TP.
 class LlavaMultiModalProjector(nn.Module):
 
-    def __init__(self, vision_hidden_size: int, text_hidden_size: int,
-                 projector_hidden_act: str):
+    def __init__(self,
+                 vision_hidden_size: int,
+                 text_hidden_size: int,
+                 projector_hidden_act: str,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
         super().__init__()
 
-        self.linear_1 = nn.Linear(vision_hidden_size,
-                                  text_hidden_size,
-                                  bias=True)
+        self.linear_1 = ColumnParallelLinear(vision_hidden_size,
+                                             text_hidden_size,
+                                             bias=True,
+                                             quant_config=quant_config,
+                                             prefix=f"{prefix}.linear_1")
         self.act = get_act_fn(projector_hidden_act)
-        self.linear_2 = nn.Linear(text_hidden_size,
-                                  text_hidden_size,
-                                  bias=True)
+        self.linear_2 = RowParallelLinear(text_hidden_size,
+                                          text_hidden_size,
+                                          bias=True,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.linear_2")
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.linear_1(image_features)
+        hidden_states, _ = self.linear_1(image_features)
         hidden_states = self.act(hidden_states)
-        hidden_states = self.linear_2(hidden_states)
+        hidden_states, _ = self.linear_2(hidden_states)
         return hidden_states
 
 
@@ -104,102 +115,115 @@ def get_max_llava_image_tokens(ctx: InputContext):
         raise ValueError(f"Unexpected select feature strategy: {strategy}")
 
 
-def dummy_data_for_llava(ctx: InputContext, seq_len: int,
-                         mm_counts: Mapping[str, int]):
+def dummy_mm_kwargs_for_llava(ctx: InputProcessingContext,
+                              mm_counts: Mapping[str, int]):
     hf_config = ctx.get_hf_config(LlavaConfig)
     vision_config = hf_config.vision_config
     num_images = mm_counts["image"]
 
-    image_feature_size = get_max_llava_image_tokens(ctx)
-
     if isinstance(vision_config, CLIPVisionConfig):
-        seq_data, ranges = dummy_seq_data_for_clip(
-            vision_config,
-            seq_len,
-            num_images,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
-        )
-
-        mm_data = dummy_image_for_clip(vision_config, num_images)
-        return DummyData(seq_data, mm_data, ranges)
+        data = dummy_image_for_clip(vision_config, num_images)
     elif isinstance(vision_config, SiglipVisionConfig):
-        seq_data, ranges = dummy_seq_data_for_siglip(
-            vision_config,
-            seq_len,
-            num_images,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
-        )
-
-        mm_data = dummy_image_for_siglip(vision_config, num_images)
-        return DummyData(seq_data, mm_data, ranges)
+        data = dummy_image_for_siglip(vision_config, num_images)
     elif isinstance(vision_config, PixtralVisionConfig):
-        seq_data, ranges = dummy_seq_data_for_pixtral_hf(
-            vision_config,
-            seq_len,
-            num_images,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
-        )
-
-        mm_data = dummy_image_for_pixtral_hf(vision_config, num_images)
-        return DummyData(seq_data, mm_data, ranges)
+        data = dummy_image_for_pixtral_hf(vision_config, num_images)
+    else:
+        msg = f"Unsupported vision config: {type(vision_config)}"
+        raise NotImplementedError(msg)
 
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
+    hf_processor = ctx.get_hf_processor()
+    image_processor = hf_processor.image_processor  # type: ignore
+    hf_inputs = image_processor.preprocess(data['image'], return_tensors="pt")
+    is_pixtral = isinstance(hf_processor, PixtralProcessor)
 
+    return MultiModalKwargs(
+        **hf_inputs,
+        is_pixtral=torch.tensor(is_pixtral),
+    )
 
-def input_processor_for_llava(ctx: InputContext, inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
 
-    model_config = ctx.model_config
+def create_metadata_for_llava(
+        ctx: InputProcessingContext) -> MultiModalProcessingMetadata:
     hf_config = ctx.get_hf_config(LlavaConfig)
-    vision_config = hf_config.vision_config
+    image_token_id = hf_config.image_token_index
+
+    def get_repl_count(
+        mm_items: list[Image],
+        hf_inputs: BatchFeature,
+        item_idx: int,
+    ) -> int:
+        return get_max_llava_image_tokens(ctx)
+
+    return {
+        "image":
+        ModalityProcessingMetadata(prompt_repls=[
+            PromptReplacement(target=[image_token_id],
+                              repl_unit=[image_token_id],
+                              repl_count=get_repl_count),
+        ]),
+    }
 
-    image_data = multi_modal_data["image"]
-    if isinstance(image_data, Image.Image):
-        image_feature_size = get_max_llava_image_tokens(ctx)
-    elif is_list_of(image_data, Image.Image):
-        image_feature_size = [get_max_llava_image_tokens(ctx)
-                              ] * len(image_data)
-    elif isinstance(image_data, torch.Tensor):
-        num_images, image_feature_size, hidden_size = image_data.shape
-    elif is_list_of(image_data, torch.Tensor):
-        image_feature_size = [item.shape[1] for item in image_data]
-    else:
-        raise TypeError(f"Invalid image type: {type(image_data)}")
 
-    if isinstance(vision_config, CLIPVisionConfig):
-        return input_processor_for_clip(
-            model_config,
-            vision_config,
-            inputs,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
-        )
-    elif isinstance(vision_config, SiglipVisionConfig):
-        return input_processor_for_siglip(
-            model_config,
-            vision_config,
-            inputs,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
-        )
-    elif isinstance(vision_config, PixtralVisionConfig):
-        # We ignore image_feature_size_override since we have non-uniform
-        # image sizes for Pixtral
-        return input_processor_for_pixtral_hf(
-            model_config,
-            vision_config,
-            inputs,
-            image_token_id=hf_config.image_token_index,
+class LlavaProcessor(BaseMultiModalProcessor):
+
+    def __init__(self, ctx: InputProcessingContext) -> None:
+        super().__init__(
+            ctx=ctx,
+            metadata=create_metadata_for_llava(ctx),
         )
 
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
+    def _patch_pixtral_processor(self, hf_processor: PixtralProcessor):
+        if getattr(hf_processor, "__is_patched__", False):
+            return  # Already patched
+
+        image_processor = hf_processor.image_processor  # type: ignore
+        orig_preprocess = image_processor.preprocess
+
+        def preprocess(__self, *args, **kwargs):
+            hf_inputs = orig_preprocess(*args, **kwargs)
+            hf_inputs["is_pixtral"] = torch.tensor(True)
+            return hf_inputs
+
+        image_processor.preprocess = MethodType(preprocess, image_processor)
+
+        hf_processor.__is_patched__ = True  # type: ignore
+
+    def _get_hf_processor(self) -> ProcessorMixin:
+        hf_processor = self.ctx.get_hf_processor()
+
+        if isinstance(hf_processor, PixtralProcessor):
+            self._patch_pixtral_processor(hf_processor)
+
+        return hf_processor
+
+    def _get_dummy_mm_kwargs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalKwargs:
+        hf_config = self.ctx.get_hf_config(LlavaConfig)
+        vision_config = hf_config.vision_config
+        num_images = mm_counts["image"]
+
+        if isinstance(vision_config, CLIPVisionConfig):
+            data = dummy_image_for_clip(vision_config, num_images)
+        elif isinstance(vision_config, SiglipVisionConfig):
+            data = dummy_image_for_siglip(vision_config, num_images)
+        elif isinstance(vision_config, PixtralVisionConfig):
+            data = dummy_image_for_pixtral_hf(vision_config, num_images)
+        else:
+            msg = f"Unsupported vision config: {type(vision_config)}"
+            raise NotImplementedError(msg)
+
+        hf_processor = self._get_hf_processor()
+        image_processor = hf_processor.image_processor  # type: ignore
+        hf_inputs = image_processor.preprocess(data['image'],
+                                               return_tensors="pt")
+        is_pixtral = isinstance(hf_processor, PixtralProcessor)
+
+        return MultiModalKwargs(
+            **hf_inputs,
+            is_pixtral=torch.tensor(is_pixtral),
+        )
 
 
 class LlavaLikeConfig(Protocol):
@@ -282,10 +306,8 @@ def init_vision_tower_for_llava(
     raise NotImplementedError(msg)
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
+@MULTIMODAL_REGISTRY.register_processor(LlavaProcessor)
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     # BitandBytes specific attributes
     bitsandbytes_stacked_params_mapping = {
@@ -325,7 +347,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
-            projector_hidden_act=config.projector_hidden_act)
+            projector_hidden_act=config.projector_hidden_act,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "multi_modal_projector"))
 
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
@@ -356,38 +380,10 @@ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
 
         return data
 
-    def _validate_image_sizes(self, images: List[torch.Tensor],
-                              sizes: List[torch.Tensor]) -> List[torch.Tensor]:
-        if not isinstance(sizes, list):
-            sizes = [sizes]
-
-        total_images = sum(size.numel() // 2 for size in sizes)
-        if total_images != len(images):
-            raise ValueError("Mismatch in number of images. "
-                             f"Expected {total_images}, got {len(images)}")
-        img_idx = 0
-        for size in sizes:
-            # Flatten the size tensor to a list of (height, width) pairs
-            size = size.view(-1, 2).tolist()
-            for expected_h, expected_w in size:
-                if img_idx >= len(images):
-                    raise ValueError("Ran out of images before sizes. "
-                                     f"{img_idx} >= {len(images)}")
-                img = images[img_idx]
-                if img.shape[-2:] != (expected_h, expected_w):
-                    raise ValueError(
-                        "Image size mismatch. Expected "
-                        f"{(expected_h, expected_w)}, got {img.shape[-2:]}")
-                if img.shape[-3] != 3:
-                    raise ValueError("Image channel mismatch. Expected 3, "
-                                     f"got {img.shape[-3]}")
-                img_idx += 1
-        return images
-
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[LlavaImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
-        image_sizes = kwargs.pop("image_sizes", None)
+        is_pixtral = kwargs.pop("is_pixtral", torch.tensor([False]))
         image_embeds = kwargs.pop("image_embeds", None)
 
         if pixel_values is None and image_embeds is None:
@@ -398,9 +394,8 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
-            # Case for models like PixtralHF that have dynamic image sizes
-            # so we need to produce a list of tensors
-            if image_sizes is not None:
+            assert isinstance(is_pixtral, torch.Tensor)
+            if is_pixtral.any():
                 images = pixel_values
 
                 def flatten_to_3d_tensors(item):
@@ -423,7 +418,7 @@ def flatten_to_3d_tensors(item):
 
                 return LlavaImagePixelInputs(
                     type="pixel_values",
-                    data=self._validate_image_sizes(images, image_sizes),
+                    data=images,
                 )
 
             return LlavaImagePixelInputs(
@@ -591,3 +586,28 @@ def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
+
+
+class MantisProcessor(LlavaProcessor):
+
+    def _get_hf_processor(self) -> ProcessorMixin:
+        try:
+            from mantis.models.mllava import MLlavaProcessor
+        except ModuleNotFoundError as exc:
+            raise ModuleNotFoundError(
+                "You need to `pip install "
+                "git+https://github.com/TIGER-AI-Lab/Mantis.git` "
+                "to use this model") from exc
+
+        processor = MLlavaProcessor.from_pretrained(
+            self.ctx.model_config.tokenizer)
+        assert isinstance(processor, ProcessorMixin)
+        return processor
+
+
+# To use this model, please use
+# `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'`
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
+@MULTIMODAL_REGISTRY.register_processor(MantisProcessor)
+class MantisForConditionalGeneration(LlavaForConditionalGeneration):
+    pass
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index ac0d265a961f0..b32032e411b0a 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -6,7 +6,7 @@
 from transformers import MambaConfig
 
 from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -23,8 +23,6 @@
                                                     MambaCacheParams)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE,
-                                      _get_graph_batch_size)
 
 from .utils import maybe_prefix
 
@@ -187,7 +185,7 @@ def forward(self,
                 inputs_embeds: Optional[torch.Tensor] = None,
                 **kwargs):
         if self.mamba_cache is None:
-            max_batch_size = (_get_graph_batch_size(
+            max_batch_size = (VllmConfig.get_graph_batch_size(
                 self.scheduler_config.max_num_seqs) if self.scheduler_config
                               else max(_BATCH_SIZES_TO_CAPTURE) + 2)
             self.mamba_cache = MambaCacheManager(
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 6254d26c7060d..5a0f202364f26 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -534,6 +534,16 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     }
     embedding_padding_modules = ["lm_head"]
 
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 98caa6857e211..a328b5a2aeea7 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -13,6 +13,7 @@
 from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
+from vllm.attention.layer import MultiHeadAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
@@ -36,23 +37,25 @@
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
 from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.platforms import _Backend
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
                            SequenceData)
 from vllm.transformers_utils.processor import get_processor
 
 from .interfaces import SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend,
-                    is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+                    maybe_prefix, merge_multimodal_embeddings)
 
 # TODO: hard-coded for now. Consider making it configurable.
 VIT_LAYERS = [-2, -9]
 NUM_PREFIX_TOKENS = 1
 ADDITIONAL_VOCAB_SIZE = 128
+DEFAULT_IMAGE_PATCH_TOKEN_ID = 152066
+DEFAULT_IM_START_TOKEN_ID = 152067
+DEFAULT_IM_END_TOKEN_ID = 152064
+DEFAULT_IM_COL_TOKEN_ID = 152065
 
 
 class MolmoImageInputs(TypedDict):
@@ -76,6 +79,11 @@ class MolmoImageInputs(TypedDict):
     `(batch_size, num_crops, num_patch)`
     """
 
+    image_start_end: Tuple[int, int]
+    """Starting and ending index of placeholder 
+    tokens
+    """
+
 
 @dataclass
 class VisionBackboneConfig:
@@ -188,13 +196,11 @@ def __init__(
             quant_config=quant_config,
         )
 
-        # Detect attention implementation.
-        self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True)
-        if self.attn_backend not in {
-                _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS
-        }:
-            raise RuntimeError(
-                f"Molmo does not support {self.attn_backend} backend now.")
+        self.scale = self.head_dim**-0.5
+        self.attn = MultiHeadAttention(self.num_heads,
+                                       self.head_dim,
+                                       self.scale,
+                                       num_kv_heads=self.num_kv_heads)
 
     def forward(self,
                 inputs_q: torch.Tensor,
@@ -210,25 +216,8 @@ def forward(self,
         xq, _ = self.wq(inputs_q)
         xk, _ = self.wk(inputs_k)
         xv, _ = self.wv(inputs_v)
-        q_shape = xq.size()[:-1] + (self.num_heads, self.head_dim)
-        kv_shape = xk.size()[:-1] + (self.num_kv_heads, self.head_dim)
-        xq = xq.view(*q_shape)
-        xk = xk.view(*kv_shape)
-        xv = xv.view(*kv_shape)
-
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            from flash_attn import flash_attn_func
-            output = flash_attn_func(xq, xk, xv, dropout_p=0.0, causal=False)
-        elif self.attn_backend == _Backend.TORCH_SDPA:
-            xq, xk, xv = (rearrange(x, "b s h d -> b h s d")
-                          for x in (xq, xk, xv))
-            output = F.scaled_dot_product_attention(xq, xk, xv)
-            output = rearrange(output, "b h s d -> b s h d ")
-        elif self.attn_backend == _Backend.XFORMERS:
-            from xformers import ops as xops
-            output = xops.memory_efficient_attention_forward(xq, xk, xv, p=0)
-
-        output = rearrange(output, "b s h d -> b s (h d)").contiguous()
+
+        output = self.attn(xq, xk, xv)
         output, _ = self.wo(output)
 
         return output
@@ -938,6 +927,8 @@ def image_input_mapper_for_molmo(
     ctx: InputContext,
     data: object,
 ):
+    if isinstance(data, list):
+        data = data[0]
     return MultiModalKwargs(data)
 
 
@@ -987,7 +978,22 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
     if "image_masks" in out:
         dummy_imgdata["image_masks"] = out["image_masks"]
     dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long)
-    return DummyData(dummy_seqdata, {"image": dummy_imgdata})
+    size = 0
+    offset = -1
+    for i in range(len(token_ids)):
+        if token_ids[i] in (DEFAULT_IMAGE_PATCH_TOKEN_ID,
+                            DEFAULT_IM_START_TOKEN_ID, DEFAULT_IM_END_TOKEN_ID,
+                            DEFAULT_IM_COL_TOKEN_ID):
+            if offset < 0:
+                offset = i
+            size += 1
+    dummy_imgdata["image_start_end"] = (offset, offset + size)
+    return DummyData(seq_data=dummy_seqdata,
+                     multi_modal_data={"image": dummy_imgdata},
+                     multi_modal_placeholders={
+                         "image":
+                         [PlaceholderRange(offset=offset, length=size)]
+                     })
 
 
 def pad_images(
@@ -1075,19 +1081,34 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
     if image_masks is not None:
         image_data["image_masks"] = image_masks
 
-    image_data["seq_len"] = torch.tensor(len(out["input_ids"]),
+    new_prompt_token_ids = out["input_ids"].tolist()
+    image_data["seq_len"] = torch.tensor(len(new_prompt_token_ids),
                                          dtype=torch.long)
 
     multi_modal_data = dict(image=image_data)
+    size = 0
+    offset = -1
+    for i in range(len(new_prompt_token_ids)):
+        if new_prompt_token_ids[i] in (DEFAULT_IMAGE_PATCH_TOKEN_ID,
+                                       DEFAULT_IM_START_TOKEN_ID,
+                                       DEFAULT_IM_END_TOKEN_ID,
+                                       DEFAULT_IM_COL_TOKEN_ID):
+            if offset < 0:
+                offset = i
+            size += 1
+    image_data["image_start_end"] = (offset, offset + size)
 
     prompt = inputs.get("prompt")
     if prompt is None:
-        prompt = tokenizer.decode(out["input_ids"])
+        prompt = tokenizer.decode(new_prompt_token_ids)
 
     return token_inputs(
-        prompt_token_ids=out["input_ids"],
+        prompt_token_ids=new_prompt_token_ids,
         prompt=prompt,
         multi_modal_data=multi_modal_data,
+        multi_modal_placeholders={
+            "image": [PlaceholderRange(offset=offset, length=size)]
+        },
     )
 
 
@@ -1133,6 +1154,7 @@ def _parse_and_validate_image_input(
     ) -> Optional[MolmoImageInputs]:
         images = kwargs.pop("images", None)
         image_masks = kwargs.pop("image_masks", None)
+        image_start_end = kwargs.pop("image_start_end", None)
         if images is None:
             return None
 
@@ -1150,6 +1172,7 @@ def _parse_and_validate_image_input(
             image_input_idx=image_input_idx,
             seq_len=seq_len,
             image_masks=image_masks,
+            image_start_end=image_start_end,
         )
 
     def _process_image_input(
@@ -1198,9 +1221,16 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
 
         # Note: In this original implementation from AI2, the final
         # vision_embeddings will be always be the same length
-        # of input embedddings, which is not very efficient.
-        # TODO(ywang96): see if this can be optimized.
+        # of input embeddings.
         vision_embeddings = torch.einsum('nd,nm->md', image_features, mat)
+
+        # Split by the sizes of the input sequences. For each full embedding,
+        # extract the actual vision embeddings to be merged.
+        vision_embeddings = list(vision_embeddings.split(seq_len.tolist()))
+        for i in range(len(vision_embeddings)):
+            start, end = image_input['image_start_end'][i]
+            vision_embeddings[i] = vision_embeddings[i][start:end]
+
         return vision_embeddings
 
     def get_input_embeddings(
@@ -1210,7 +1240,11 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         inputs_embeds = self.model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
-            inputs_embeds = inputs_embeds + multimodal_embeddings
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings, [
+                    DEFAULT_IMAGE_PATCH_TOKEN_ID, DEFAULT_IM_START_TOKEN_ID,
+                    DEFAULT_IM_END_TOKEN_ID, DEFAULT_IM_COL_TOKEN_ID
+                ])
         return inputs_embeds
 
     def forward(
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index c7b4c22b6896b..34cb9981c167b 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -435,9 +435,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
+
+        self.sampler = get_sampler()
+
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index eef23029a2aca..3c7854ce388ab 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -12,22 +12,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import itertools
-import re
-from functools import cached_property, lru_cache
-from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set,
-                    Tuple, TypedDict, Union)
+from functools import cached_property
+from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
+                    TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
-from PIL import Image
-from transformers import CLIPVisionConfig, PretrainedConfig
+from transformers import (BatchFeature, CLIPVisionConfig, PretrainedConfig,
+                          ProcessorMixin)
 
 from vllm.attention import AttentionMetadata
-from vllm.config import ModelConfig, VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
+from vllm.config import VllmConfig
+from vllm.inputs import InputContext
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -36,12 +32,18 @@
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
-from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token
+from vllm.multimodal.image import cached_get_image_processor
+from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        InputProcessingContext,
+                                        ModalityProcessingMetadata,
+                                        MultiModalDataDict,
+                                        MultiModalProcessingMetadata,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
-from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
+from .clip import dummy_image_for_clip
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
@@ -303,231 +305,99 @@ def add_image_newline(self, image_features_hd):
         return image_features_hd_newline
 
 
-# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L57
-def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336):
-    target_height = int(np.ceil(height / padding_unit) * padding_unit)
-    top_padding = int((target_height - height) / 2)
-    bottom_padding = target_height - height - top_padding
-    padded_width = width
-    padded_height = height + top_padding + bottom_padding
-    return padded_width, padded_height
-
-
-# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L90
-def _calc_hd_transform_size(*, width: int, height: int, hd_num: int):
-    transposed = False
-    if width < height:
-        width, height = height, width
-        transposed = True
-
-    ratio = width / height
-    scale = 1
-    while scale * np.ceil(scale / ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-
-    new_width = int(scale * 336)
-    new_height = int(new_width / ratio)
-
-    padded_width, padded_height = _calc_padded_size(width=new_width,
-                                                    height=new_height)
-
-    if transposed:
-        padded_width, padded_height = padded_height, padded_width
-
-    return padded_width, padded_height
-
-
-# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L181
-def get_phi3v_image_feature_size(
-    hf_config: Dict[str, Any],
-    *,
-    input_height: int,
-    input_width: int,
-    num_crops: int,
-) -> int:
-    if num_crops is None:
-        num_crops = hf_config.get("num_crops", 16)
-    new_width, new_height = _calc_hd_transform_size(width=input_width,
-                                                    height=input_height,
-                                                    hd_num=num_crops)
-
-    return (new_height // 336 * new_width // 336 + 1) * 144 + 1 \
-        + (new_height // 336 + 1) * 12
-
-
 def get_max_phi3v_image_tokens(ctx: InputContext,
                                *,
                                num_crops: Optional[int] = None):
+    mm_processor_kwargs = {}
+    if num_crops is not None:
+        mm_processor_kwargs["num_crops"] = num_crops
 
-    return get_phi3v_image_feature_size(
-        ctx.get_hf_image_processor_config(),
-        input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
-        input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-        num_crops=num_crops,
+    model_config = ctx.model_config
+    image_processor = cached_get_image_processor(
+        model_config.model,
+        trust_remote_code=model_config.trust_remote_code,
+        **mm_processor_kwargs,
+    )
+
+    num_tokens = image_processor.calc_num_image_tokens_from_image_size(
+        width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
+        height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
     )
+    return num_tokens
 
 
-def dummy_data_for_phi3v(ctx: InputContext,
-                         seq_len: int,
-                         mm_counts: Mapping[str, int],
-                         *,
-                         num_crops: Optional[int] = None):
+def dummy_mm_kwargs_for_phi3v(ctx: InputProcessingContext,
+                              mm_counts: Mapping[str, int]):
     num_images = mm_counts["image"]
 
-    image_feature_size = get_max_phi3v_image_tokens(ctx, num_crops=num_crops)
-
-    seq_data, ranges = dummy_seq_data_for_clip(
-        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-        seq_len,
-        num_images,
-        image_token_id=_IMAGE_TOKEN_ID,
-        image_feature_size_override=image_feature_size,
-    )
-    mm_data = dummy_image_for_clip(
+    data = dummy_image_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         num_images,
         image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH,
         image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
     )
 
-    return DummyData(seq_data, mm_data, ranges)
-
+    hf_processor = ctx.get_hf_processor()
+    image_processor = hf_processor.image_processor  # type: ignore
+    hf_inputs = image_processor.preprocess(data['image'], return_tensors="pt")
 
-@lru_cache
-def _get_image_placeholder_token_id_candidates(
-    model_config: ModelConfig,
-    idx: int,
-) -> List[List[int]]:
-    assert idx > 0
+    return MultiModalKwargs(**hf_inputs)
 
-    tokenizer = cached_get_tokenizer(model_config.tokenizer)
 
-    # This is used when the image token is at the start of the string
-    start_candidate = tokenizer.encode(f"<|image_{idx}|>",
-                                       add_special_tokens=False)
+def create_metadata_for_phi3v(
+        ctx: InputProcessingContext) -> MultiModalProcessingMetadata:
+    return {
+        "image":
+        ModalityProcessingMetadata(prompt_repls=[
+            PromptReplacement(target=[_IMAGE_TOKEN_ID],
+                              repl_unit=[_IMAGE_TOKEN_ID],
+                              repl_count=get_max_phi3v_image_tokens(ctx)),
+        ]),
+    }
 
-    # This is used when the image token is in the middle of the string
-    # We need to get the token for "<", not "▁<"
-    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/raw/main/tokenizer.json
-    a_token_id, = tokenizer.encode("a", add_special_tokens=False)
-    a_token_id_, *middle_candidate = tokenizer.encode(f"a<|image_{idx}|>",
-                                                      add_special_tokens=False)
-    assert a_token_id == a_token_id_
 
-    return [start_candidate, middle_candidate]
+class Phi3VProcessor(BaseMultiModalProcessor):
 
+    def __init__(self, ctx: InputProcessingContext) -> None:
+        super().__init__(
+            ctx=ctx,
+            metadata=create_metadata_for_phi3v(ctx),
+        )
 
-def input_processor_for_phi3v(ctx: InputContext,
-                              inputs: DecoderOnlyInputs,
-                              *,
-                              num_crops: Optional[int] = None):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    model_config = ctx.model_config
-    hf_config = ctx.get_hf_image_processor_config()
-
-    image_data = multi_modal_data["image"]
-    if isinstance(image_data, Image.Image):
-        w, h = image_data.size
-        image_feature_size = [
-            get_phi3v_image_feature_size(hf_config,
-                                         input_width=w,
-                                         input_height=h,
-                                         num_crops=num_crops)
-        ]
-        image_data = [image_data]
-    elif is_list_of(image_data, Image.Image):
-        image_feature_size = []
-        for image in image_data:
-            w, h = image.size
-            image_feature_size.append(
-                get_phi3v_image_feature_size(hf_config,
-                                             input_width=w,
-                                             input_height=h,
-                                             num_crops=num_crops))
-    elif isinstance(image_data, torch.Tensor):
-        image_feature_size = [image_data.shape[0]]
-        image_data = [image_data]
-    elif is_list_of(image_data, torch.Tensor):
-        image_feature_size = [item.shape[0] for item in image_data]
-    else:
-        raise TypeError(f"Invalid image type: {type(image_data)}")
-
-    prompt = inputs.get("prompt")
-    if prompt is None:
-        # for async server request, we assume prompt and its token_ids is always
-        # in correct format. And num_image_tags == len(image_data) always True.
-        image_idx = range(1, len(image_data) + 1)
-        new_prompt = None
-    else:
-        image_idx = sorted(map(int, re.findall(r"<\|image_(\d+)\|>+", prompt)))
-        if prompt.count("<|image|>") > 0:
-            logger.warning("Please follow the prompt format that is "
-                           "documented on HuggingFace which does not involve "
-                           "repeating <|image|> tokens.")
-        elif (num_image_tags := len(image_idx)) > 1:
-            assert num_image_tags == len(
-                image_data), "The count of image_placeholder not match image's"
-        new_prompt = prompt
-
-    prompt_token_ids = inputs["prompt_token_ids"].copy()
-
-    # masked placeholder with image token id
-    for idx in image_idx:
-        candidates = _get_image_placeholder_token_id_candidates(model_config,
-                                                                idx=idx)
-
-        for candidate in candidates:
-            for i in range(len(prompt_token_ids) - len(candidate) + 1):
-                if prompt_token_ids[i:i + len(candidate)] == candidate:
-                    prompt_token_ids[i:i +
-                                     len(candidate)] = ([_IMAGE_TOKEN_ID] *
-                                                        len(candidate))
-                    break
-
-    # merge consecutive tag ids
-    merged_token_ids: List[int] = []
-    for is_placeholder, token_ids in itertools.groupby(
-            prompt_token_ids, lambda x: x == _IMAGE_TOKEN_ID):
-        if is_placeholder:
-            merged_token_ids.append(_IMAGE_TOKEN_ID)
-        else:
-            merged_token_ids.extend(list(token_ids))
-
-    # TODO: Move this to utils or integrate with clip.
-    new_token_ids: List[int] = []
-    placeholder_ranges: List[PlaceholderRange] = []
-    placeholder_idx = 0
-    while merged_token_ids:
-        token_id = merged_token_ids.pop(0)
-        if token_id == _IMAGE_TOKEN_ID:
-            replacement_ids = repeat_and_pad_token(
-                _IMAGE_TOKEN_ID,
-                repeat_count=image_feature_size[placeholder_idx],
-            )
-            placeholder_ranges.append({
-                "offset": len(new_token_ids),
-                "length": len(replacement_ids)
-            })
-            new_token_ids.extend(replacement_ids)
-            placeholder_idx += 1
-        else:
-            new_token_ids.append(token_id)
-
-    # NOTE: Create a defensive copy of the original inputs
-    return token_inputs(prompt_token_ids=new_token_ids,
-                        prompt=new_prompt,
-                        multi_modal_data=multi_modal_data,
-                        multi_modal_placeholders={"image": placeholder_ranges})
+    def _get_hf_processor(
+        self,
+        *,
+        num_crops: Optional[int] = None,
+    ) -> ProcessorMixin:
+        if num_crops is not None:
+            return self.ctx.get_hf_processor(num_crops=num_crops)
+        return self.ctx.get_hf_processor()
+
+    def _apply_hf_processor(
+        self,
+        prompt: str,
+        mm_data: MultiModalDataDict,
+        mm_processor_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        processed_outputs = super()._apply_hf_processor(
+            prompt, mm_data, mm_processor_kwargs)
+        # Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids,
+        # which will cause OverflowError when decoding the prompt_ids.
+        # Therefore, we need to do an early replacement here
+        token_ids = processed_outputs['input_ids']
+        token_ids[token_ids < 0] = _IMAGE_TOKEN_ID
+        processed_outputs['input_ids'] = token_ids
+        return processed_outputs
+
+    def _get_dummy_mm_kwargs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalKwargs:
+        return dummy_mm_kwargs_for_phi3v(self.ctx, mm_counts)
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v)
+@MULTIMODAL_REGISTRY.register_processor(Phi3VProcessor)
 class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 215727cadd954..c6786c363ab4a 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -48,6 +48,9 @@
 except ImportError:
     USE_XFORMERS_OPS = False
 
+PIXTRAL_IMAGE_BREAK_ID = 12
+PIXTRAL_IMAGE_END_ID = 13
+
 
 def get_max_pixtral_image_tokens(ctx: InputContext):
     tokenizer = cached_get_tokenizer(
@@ -68,7 +71,6 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
         tokenizer_mode=ctx.model_config.tokenizer_mode)
 
     mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
-    patch_size = mm_encoder.mm_config.image_patch_size
     image_token_id = mm_encoder.special_ids.img
 
     mm_config = ctx.model_config.multimodal_config
@@ -78,8 +80,8 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
     size = 256
     image = Image.new("RGB", (size, size), color=0)
 
-    image_feature_size = (size**2) // (patch_size**2)
-
+    encoding = tokenizer.instruct.mm_encoder(ImageChunk(image=image))
+    image_feature_size = len(encoding.tokens)
     num_image_tokens = image_feature_size * num_images
     seq_data = SequenceData.from_prompt_token_counts(
         (image_token_id, num_image_tokens),
@@ -101,14 +103,13 @@ def input_mapper_for_pixtral(ctx: InputContext,
 
     Args:
         ctx: Context of the loaded model.
-        data: data potentially containing image/image embeddings to be mapped
-            to pixel_values in .forward() for a visual QWenLMHeadModel model.
+        data: data potentially containing PIL images to be processed
+            and mapped to `images`.
 
     Returns:
         MultiModalKwargs containing the stacked normalized images tensor or
         image embeddings.
     """
-    # Early exit if we have provided an image to a language only Qwen model
     model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(
         model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode)
@@ -116,35 +117,67 @@ def input_mapper_for_pixtral(ctx: InputContext,
     data_list = data if isinstance(data, list) else [data]
 
     images = []
+    image_tokens_list = []
     for image_data in data_list:
         image = ImageChunk(image=image_data)
         encoding = tokenizer.instruct.mm_encoder(image)
         image = torch.from_numpy(encoding.image).to(device="cuda",
                                                     dtype=torch.float16)
         images.append(image)
+        image_tokens_list.append(encoding.tokens)
 
-    return MultiModalKwargs({"images": images})
+    image_tokens = torch.tensor([
+        token_id for image_tokens in image_tokens_list
+        for token_id in image_tokens
+    ])
+    return MultiModalKwargs({"images": images, "image_tokens": image_tokens})
 
 
 def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
     multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is not None and "image" in multi_modal_data:
-        tokenizer = cached_get_tokenizer(
-            ctx.model_config.tokenizer,
-            tokenizer_mode=ctx.model_config.tokenizer_mode)
-
-        mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
-        image_token_id = mm_encoder.special_ids.img
+    if multi_modal_data is None or "image" not in multi_modal_data:
+        return inputs
 
-        if image_token_id not in inputs['prompt_token_ids']:
-            raise ValueError(
-                f"You've passed {inputs=} without {image_token_id=}"
-                " Make sure to process your input via mistral_common's"
-                " tokenizer or pass a chat completion request. For more"
-                " For more info, see: "
-                "https://github.com/vllm-project/vllm/issues/8411.")
+    prompt_token_ids = inputs.get("prompt_token_ids")
+    prompt = inputs.get("prompt")
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        tokenizer_mode=ctx.model_config.tokenizer_mode)
 
-    return inputs
+    mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
+    image_token_id = mm_encoder.special_ids.img
+    image_break_id = mm_encoder.special_ids.img_break
+    image_end_id = mm_encoder.special_ids.img_end
+
+    if image_token_id not in inputs['prompt_token_ids']:
+        raise ValueError(
+            f"You've passed {inputs=} without {image_token_id=}"
+            " Make sure to process your input via mistral_common's"
+            " tokenizer or pass a chat completion request. For more"
+            " For more info, see: "
+            "https://github.com/vllm-project/vllm/issues/8411.")
+
+    # Get precise tracking of placeholder positions
+    placeholder_ranges = []
+    curr_offset = -1
+    curr_length = 0
+    for i in range(len(prompt_token_ids)):
+        if prompt_token_ids[i] in (image_token_id, image_break_id):
+            if curr_offset < 0:
+                curr_offset = i
+            curr_length += 1
+        elif prompt_token_ids[i] == image_end_id:
+            curr_length += 1
+            placeholder_ranges.append(
+                PlaceholderRange(offset=curr_offset, length=curr_length))
+            curr_offset = -1
+            curr_length = 0
+        else:
+            pass
+    return token_inputs(prompt=prompt,
+                        prompt_token_ids=prompt_token_ids,
+                        multi_modal_data=multi_modal_data,
+                        multi_modal_placeholders={"image": placeholder_ranges})
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_pixtral)
@@ -192,11 +225,29 @@ def sampler(self):
         return get_sampler()
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
-        image_input = self._parse_and_validate_image_input(**kwargs)
+        image_input, image_tokens = self._parse_and_validate_image_input(
+            **kwargs)
         if image_input is None:
             return None
+
         vision_embeddings = self._process_image_input(image_input)
-        return vision_embeddings
+
+        # NOTE: We patch the outputs of the vision encoder with embeddings
+        # from `[IMG_BREAK]` and `[IMG_END]` tokens.
+        image_embeds = self.language_model.get_input_embeddings(image_tokens)
+        image_token_mask = image_tokens == self.vision_args.image_token_id
+        image_embeds[image_token_mask] = vision_embeddings
+
+        # NOTE: Image embeddings are split into separate tensors for each image
+        # by the indices of `[IMG_END]` token.
+        split_indices = torch.where(
+            image_tokens == PIXTRAL_IMAGE_END_ID)[0] + 1
+        if len(split_indices) <= 1:
+            # Do not split, return as tensor of shape [1, fs, hs]
+            return image_embeds.unsqueeze(0)
+
+        image_embeds = image_embeds.tensor_split(split_indices.cpu())
+        return image_embeds
 
     def get_input_embeddings(
         self,
@@ -206,8 +257,10 @@ def get_input_embeddings(
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
             inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.vision_args.image_token_id)
+                input_ids, inputs_embeds, multimodal_embeddings, [
+                    self.vision_args.image_token_id, PIXTRAL_IMAGE_END_ID,
+                    PIXTRAL_IMAGE_BREAK_ID
+                ])
         return inputs_embeds
 
     def forward(
@@ -245,10 +298,11 @@ def forward(
     def _parse_and_validate_image_input(
         self,
         images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor],
-                               torch.Tensor]] = None
+                               torch.Tensor]] = None,
+        image_tokens: Optional[torch.Tensor] = None,
     ) -> Optional[List[torch.Tensor]]:
         if images is None:
-            return None
+            return None, None
 
         if isinstance(images, torch.Tensor):
             # if passed as batch take all images
@@ -267,7 +321,16 @@ def _parse_and_validate_image_input(
 
             images = flatten_images
 
-        return images
+        if isinstance(image_tokens, torch.Tensor):
+            # image_tokens are batched
+            image_tokens = image_tokens.flatten()
+        elif isinstance(image_tokens, list):
+            # image_tokens are of different lengths thus passed as a list
+            image_tokens = torch.cat(image_tokens)
+
+        assert image_tokens.dim() == 1
+
+        return images, image_tokens
 
     def _process_image_input(self,
                              image_input: List[torch.Tensor]) -> torch.Tensor:
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 7d4cc4b69e614..3ce4eb5869f21 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -444,14 +444,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.model = Qwen2Model(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
 
-        if config.tie_word_embeddings:
-            self.lm_head = self.model.embed_tokens
+        if get_pp_group().is_last_rank:
+            if config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(config.vocab_size,
+                                              config.hidden_size,
+                                              quant_config=quant_config,
+                                              prefix=maybe_prefix(
+                                                  prefix, "lm_head"))
         else:
-            self.lm_head = ParallelLMHead(config.vocab_size,
-                                          config.hidden_size,
-                                          quant_config=quant_config,
-                                          prefix=maybe_prefix(
-                                              prefix, "lm_head"))
+            self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = get_sampler()
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index a0605fee82aca..48a2d470414b9 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -19,7 +19,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
-from functools import lru_cache
+from functools import cached_property, lru_cache
 from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
                     Union)
 
@@ -34,12 +34,7 @@
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
                          InputContext, token_inputs)
 from vllm.logger import init_logger
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
-from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import NestedTensors
@@ -47,15 +42,11 @@
 from vllm.sequence import IntermediateTensors, SequenceData
 
 from .interfaces import SupportsMultiModal, SupportsPP
-from .utils import merge_multimodal_embeddings
+from .utils import (AutoWeightsLoader, init_vllm_registered_model,
+                    maybe_prefix, merge_multimodal_embeddings)
 
 logger = init_logger(__name__)
 
-_KEYS_TO_MODIFY_MAPPING = {
-    "language_model.lm_head": "lm_head",
-    "language_model.model": "language_model",
-}
-
 
 # # === Audio Inputs === #
 class Qwen2AudioInputs(TypedDict):
@@ -281,25 +272,23 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.quant_config = quant_config
 
-        self.language_model = Qwen2Model(
-            vllm_config=vllm_config.with_hf_config(config.text_config),
-            prefix=prefix)
-        self.unpadded_vocab_size = config.text_config.vocab_size
-        if config.text_config.tie_word_embeddings:
-            self.lm_head = self.language_model.embed_tokens
-        else:
-            self.lm_head = ParallelLMHead(config.text_config.vocab_size,
-                                          config.text_config.hidden_size,
-                                          quant_config=quant_config)
-        logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.text_config.vocab_size,
-                                                logit_scale)
-        self.sampler = get_sampler()
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.text_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+            architectures=["Qwen2ForCausalLM"],
+        )
 
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
+    @cached_property
+    def sampler(self):
+        if hasattr(self.language_model, "sampler"):
+            return self.language_model.sampler
+
+        return get_sampler()
+
     def _validate_and_reshape_mm_tensor(self,
                                         mm_input: Union[torch.Tensor,
                                                         List[torch.Tensor]],
@@ -414,72 +403,30 @@ def forward(
                                                       multimodal_embeddings)
             input_ids = None
 
-        hidden_states = self.language_model(input_ids,
-                                            positions,
-                                            kv_caches,
-                                            attn_metadata,
-                                            intermediate_tensors,
-                                            inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(input_ids,
+                                                  positions,
+                                                  kv_caches,
+                                                  attn_metadata,
+                                                  intermediate_tensors,
+                                                  inputs_embeds=inputs_embeds)
         return hidden_states
 
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
-        return logits
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
 
     def sample(
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
+        return self.language_model.sample(logits, sampling_metadata)
 
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: Set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if (self.config.text_config.tie_word_embeddings
-                    and "lm_head.weight" in name):
-                continue
-            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
-                if key_to_modify in name:
-                    name = name.replace(key_to_modify, new_key)
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name or 'audio' in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 27175dbae7483..cfc90cdab01e4 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -21,7 +21,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
-from functools import partial
+from functools import cached_property, partial
 from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
                     Optional, Set, Tuple, Type, TypedDict, Union)
 
@@ -40,7 +40,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.distributed import get_pp_group, parallel_state
+from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
                          InputContext, token_inputs)
@@ -49,15 +49,12 @@
 from vllm.model_executor.layers.activation import QuickGELU
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.gptq_marlin import (
     GPTQMarlinConfig)
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
-from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict,
@@ -69,9 +66,8 @@
 from vllm.transformers_utils.processor import cached_get_processor
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
-from .utils import (PPMissingLayer, get_vit_attn_backend,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, maybe_prefix)
+from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend,
+                    init_vllm_registered_model, maybe_prefix)
 
 logger = init_logger(__name__)
 
@@ -506,6 +502,8 @@ def __init__(
         mlp_ratio: float = vision_config.mlp_ratio
 
         self.spatial_merge_size = spatial_merge_size
+        self.num_heads = num_heads
+        self.embed_dim = embed_dim
 
         self.patch_embed = Qwen2VisionPatchEmbed(
             patch_size=patch_size,
@@ -595,6 +593,53 @@ def forward(
         x = self.merger(x)
         return x
 
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name.endswith("qkv.weight"):
+                    visual_num_heads = self.num_heads
+                    visual_embed_dim = self.embed_dim
+                    head_size = visual_embed_dim // visual_num_heads
+                    loaded_weight = loaded_weight.view(3, visual_num_heads,
+                                                       head_size,
+                                                       visual_embed_dim)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1, visual_embed_dim)
+                elif name.endswith("qkv.bias"):
+                    visual_num_heads = self.num_heads
+                    visual_embed_dim = self.embed_dim
+                    head_size = visual_embed_dim // visual_num_heads
+                    loaded_weight = loaded_weight.view(3, visual_num_heads,
+                                                       head_size)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1)
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
 
 # === Vision input helpers === #
 
@@ -1082,27 +1127,21 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             prefix=maybe_prefix(prefix, "visual"),
         )
 
-        self.model = Qwen2Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+            architectures=["Qwen2ForCausalLM"],
+        )
 
-        if get_pp_group().is_last_rank:
-            if config.tie_word_embeddings:
-                self.lm_head = self.model.embed_tokens
-            else:
-                self.lm_head = ParallelLMHead(config.vocab_size,
-                                              config.hidden_size,
-                                              quant_config=quant_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "lm_head"))
-        else:
-            self.lm_head = PPMissingLayer()
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
 
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
+    @cached_property
+    def sampler(self):
+        if hasattr(self.language_model, "sampler"):
+            return self.language_model.sampler
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        return get_sampler()
 
     def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
         # GPTQ configs do not have a list of ignored modules, however AutoGPTQ
@@ -1261,7 +1300,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[List[Tuple[NestedTensors,
                                                    str]]] = None,
     ) -> torch.Tensor:
-        inputs_embeds = self.model.get_input_embeddings(input_ids)
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
             for embeddings, modality in multimodal_embeddings:
                 if modality == "image":
@@ -1330,7 +1369,7 @@ def forward(
                                                       multimodal_embeddings)
             input_ids = None
 
-        hidden_states = self.model(
+        hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
             kv_caches=kv_caches,
@@ -1340,80 +1379,28 @@ def forward(
         )
         return hidden_states
 
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
-        return logits
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
 
     def sample(
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
+        return self.language_model.sample(logits, sampling_metadata)
 
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "up_proj", 1),
-            ("gate_up_proj", "gate_proj", 0),
-        ]
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: Set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if self.config.tie_word_embeddings and "lm_head.weight" in name:
-                continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                if "visual" in name and name.endswith("qkv.weight"):
-                    visual_num_heads = self.config.vision_config.num_heads
-                    visual_embed_dim = self.config.vision_config.embed_dim
-                    head_size = visual_embed_dim // visual_num_heads
-                    loaded_weight = loaded_weight.view(3, visual_num_heads,
-                                                       head_size,
-                                                       visual_embed_dim)
-                    loaded_weight = loaded_weight.transpose(0, 1)
-                    loaded_weight = loaded_weight.reshape(-1, visual_embed_dim)
-                elif "visual" in name and name.endswith("qkv.bias"):
-                    visual_num_heads = self.config.vision_config.num_heads
-                    visual_embed_dim = self.config.vision_config.embed_dim
-                    head_size = visual_embed_dim // visual_num_heads
-                    loaded_weight = loaded_weight.view(3, visual_num_heads,
-                                                       head_size)
-                    loaded_weight = loaded_weight.transpose(0, 1)
-                    loaded_weight = loaded_weight.reshape(-1)
-                try:
-                    # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
-                        continue
-                    if is_pp_missing_parameter(name, self):
-                        continue
-                    param = params_dict[name]
-                except KeyError:
-                    raise ValueError(f"Unexpected weight: {name}") from None
-
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        hf_to_vllm_mapper = WeightsMapper(
+            orig_to_new_prefix={
+                "lm_head.": "language_model.lm_head.",
+                "model.": "language_model.model.",
+            })
+
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index d8bd6d3603e84..36b39b6b773e2 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -153,6 +153,7 @@
     "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
     "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),  # noqa: E501
     "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"),  # noqa: E501
+    "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),  # noqa: E501
     "MiniCPMV": ("minicpmv", "MiniCPMV"),
     "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
     "NVLM_D": ("nvlm_d", "NVLM_D_Model"),
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index deaed0ba7e4ce..6fb9e2cc4584f 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -6,12 +6,11 @@
 
 import numpy as np
 import torch
-import torch.nn.functional as F
 from PIL import Image
 from torch import nn
 from transformers import SiglipVisionConfig
 
-from vllm.attention.selector import _Backend
+from vllm.attention.layer import MultiHeadAttention
 from vllm.config import ModelConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.inputs import DecoderOnlyInputs, token_inputs
@@ -29,8 +28,6 @@
                                    resolve_visual_encoder_outputs)
 from vllm.sequence import SequenceData
 
-from .utils import get_vit_attn_backend
-
 
 def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
     # Since interpolation is applied, the image size need not be divisible
@@ -291,52 +288,18 @@ def __init__(
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn_backend = get_vit_attn_backend(support_fa=False)
-        if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}:
-            raise RuntimeError(
-                f"SIGLIP does not support {self.attn_backend} backend now.")
+        self.attn = MultiHeadAttention(self.num_heads_per_partition,
+                                       self.head_dim, self.scale)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor:
         """Input shape: Batch x Time x Channel"""
-        batch_size, q_len, _ = hidden_states.size()
-
         qkv_states, _ = self.qkv_proj(hidden_states)
         query_states, key_states, value_states = qkv_states.chunk(3, dim=-1)
 
-        query_states = query_states.view(batch_size, q_len,
-                                         self.num_heads_per_partition,
-                                         self.head_dim)
-        key_states = key_states.view(batch_size, q_len,
-                                     self.num_heads_per_partition,
-                                     self.head_dim)
-        value_states = value_states.view(batch_size, q_len,
-                                         self.num_heads_per_partition,
-                                         self.head_dim)
-
-        if self.attn_backend == _Backend.XFORMERS:
-            from xformers import ops as xops
-
-            out = xops.memory_efficient_attention_forward(query_states,
-                                                          key_states,
-                                                          value_states,
-                                                          p=self.dropout,
-                                                          scale=self.scale)
-        elif self.attn_backend == _Backend.TORCH_SDPA:
-            query_states, key_states, value_states = (x.transpose(1, 2)
-                                                      for x in (query_states,
-                                                                key_states,
-                                                                value_states))
-            out = F.scaled_dot_product_attention(query_states,
-                                                 key_states,
-                                                 value_states,
-                                                 dropout_p=self.dropout,
-                                                 scale=self.scale)
-            out = out.transpose(1, 2)
-
-        out = out.view(batch_size, q_len, -1)
+        out = self.attn(query_states, key_states, value_states)
         attn_output, _ = self.out_proj(out)
 
         return attn_output, None
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index f58710d215056..caae0b65d7d10 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -443,10 +443,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
 
+        self.sampler = get_sampler()
+
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 7a1e1f9bf2be4..269b66806adf4 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -17,7 +17,7 @@
 from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors
 from vllm.platforms import _Backend, current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_pin_memory_available
+from vllm.utils import is_pin_memory_available, print_warning_once
 
 logger = init_logger(__name__)
 
@@ -251,12 +251,15 @@ def init_vllm_registered_model(
     """
     from vllm.model_executor.model_loader.loader import _initialize_model
 
+    if hf_config is None and architectures is not None:
+        # So that the architectures field is overridden
+        hf_config = vllm_config.model_config.hf_config
+
     if hf_config is not None:
-        vllm_config = vllm_config.with_hf_config(hf_config)
+        vllm_config = vllm_config.with_hf_config(hf_config,
+                                                 architectures=architectures)
 
-    return _initialize_model(vllm_config=vllm_config,
-                             prefix=prefix,
-                             architectures=architectures)
+    return _initialize_model(vllm_config=vllm_config, prefix=prefix)
 
 
 @overload
@@ -406,16 +409,42 @@ def merge_multimodal_embeddings(
     input_ids: torch.Tensor,
     inputs_embeds: torch.Tensor,
     multimodal_embeddings: NestedTensors,
-    placeholder_token_id: int,
+    placeholder_token_id: Union[int, List[int]],
 ) -> torch.Tensor:
     """
     Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
     positions in ``inputs_embeds`` corresponding to placeholder tokens in
     ``input_ids``.
+    
+    ``placeholder_token_id`` can be a list of token ids (e.g, token ids 
+    of img_start, img_break, and img_end tokens) when needed: This means 
+    the order of these tokens in the ``input_ids`` MUST MATCH the order of 
+    their embeddings in ``multimodal_embeddings`` since we need to 
+    slice-merge instead of individually scattering.
+
+    For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
+    - T is text token
+    - S is image start token
+    - I is image embedding token
+    - B is image break token
+    - E is image end token.
+    
+    Then the image embeddings (that correspond to I's) from vision encoder 
+    must be padded with embeddings of S, B, and E in the same order of 
+    input_ids for a correct embedding merge.
 
     Note:
         This updates ``inputs_embeds`` in place.
     """
+    if isinstance(placeholder_token_id, list):
+        placeholder_token_id = torch.tensor(placeholder_token_id,
+                                            device=input_ids.device)
+        return _merge_multimodal_embeddings(
+            inputs_embeds,
+            torch.isin(input_ids, placeholder_token_id),
+            multimodal_embeddings,
+        )
+
     return _merge_multimodal_embeddings(
         inputs_embeds,
         (input_ids == placeholder_token_id),
@@ -592,7 +621,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
             if is_flash_attn_2_available():
                 selected_backend = _Backend.FLASH_ATTN
             else:
-                logger.warning(
+                print_warning_once(
                     "Current `vllm-flash-attn` has a bug inside vision module, "
                     "so we use xformers backend instead. You can run "
                     "`pip install flash-attn` to use flash-attention backend.")
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 03a5f3a91f7a1..928c31a2f2843 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -27,18 +27,3 @@
     "MULTIMODAL_REGISTRY",
     "MultiModalRegistry",
 ]
-
-
-def __getattr__(name: str):
-    import warnings
-
-    if name == "MultiModalInputs":
-        msg = ("MultiModalInputs has been renamed to MultiModalKwargs. "
-               "The original name will take another meaning in an upcoming "
-               "version.")
-
-        warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
-        return MultiModalKwargs
-
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index bbb8fb4bc1cd1..7dba94b885b6d 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -226,16 +226,16 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
+        from vllm.model_executor.models import supports_multimodal
 
         model_cls, _ = get_model_architecture(model_config)
 
-        if model_cls not in self._input_mappers:
+        if not supports_multimodal(model_cls):
             return 0
 
         max_mm_tokens = self._max_mm_tokens.get(model_cls)
         if max_mm_tokens is None:
-            raise KeyError(f"No maximum number of multi-modal tokens is given "
-                           f"for model class {model_cls.__name__} in {self}.")
+            return 0
 
         if callable(max_mm_tokens):
             mm_processor_kwargs = get_allowed_kwarg_only_overrides(
@@ -326,26 +326,47 @@ def from_seq_group(
             src_ranges  = []
             dest_ranges = []
         """
-        if (not seq_group.multi_modal_data
-                or not seq_group.multi_modal_placeholders):
-            return seq_group.multi_modal_data, {}
+        seq_mm_data = seq_group.multi_modal_data
+        seq_mm_placeholders = seq_group.multi_modal_placeholders
+
+        if not seq_mm_data or not seq_mm_placeholders:
+            return seq_mm_data, {}
+
+        # For merged processor, we directly use mm_kwargs as mm_data
+        if isinstance(seq_mm_data, MultiModalKwargs):
+            placeholder_maps = dict[str, MultiModalPlaceholderMap]()
+
+            for modality, placeholders in seq_mm_placeholders.items():
+                placeholder_map = MultiModalPlaceholderMap()
+
+                if positions:
+                    placeholder_map.append_items_from_seq_group(
+                        positions,
+                        # Dummy, since we don't care about intersecting items
+                        [None] * len(placeholders),
+                        placeholders,
+                    )
+
+                placeholder_maps[modality] = placeholder_map
+
+            return seq_mm_data, placeholder_maps
 
-        mm_data = {**seq_group.multi_modal_data}
-        placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict(
+        mm_data = {**seq_mm_data}
+        placeholder_maps = defaultdict[str, MultiModalPlaceholderMap](
             MultiModalPlaceholderMap)
 
-        for (
-                modality,
-                placeholders,
-        ) in seq_group.multi_modal_placeholders.items():
+        for modality, placeholders in seq_mm_placeholders.items():
             mm_items = mm_data.pop(modality)
             if not isinstance(mm_items, list):
                 mm_items = [mm_items]
 
             if positions:
-                intersecting_items = placeholder_maps[
-                    modality].append_items_from_seq_group(
-                        positions, mm_items, placeholders)
+                intersecting_items = placeholder_maps[modality] \
+                    .append_items_from_seq_group(
+                        positions,
+                        mm_items,
+                        placeholders,
+                    )
 
                 if intersecting_items:
                     mm_data[modality] = intersecting_items
@@ -433,18 +454,3 @@ def index_map(self) -> "IndexMap":
 
         return MultiModalPlaceholderMap.IndexMap(src=src_indices,
                                                  dest=dest_indices)
-
-
-def __getattr__(name: str):
-    import warnings
-
-    if name == "MultiModalInputs":
-        msg = ("MultiModalInputs has been renamed to MultiModalKwargs. "
-               "The original name will take another meaning in an upcoming "
-               "version.")
-
-        warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
-        return MultiModalKwargs
-
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 640c7c04b8817..229a8fbdf5831 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -96,7 +96,8 @@ class PlaceholderRange(TypedDict):
     """The length of the placeholder."""
 
 
-NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor]
+NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor,
+                      Tuple[torch.Tensor, ...]]
 """
 Uses a list instead of a tensor if the dimensions of each element do not match.
 """
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 28c8dda581982..922c83b6fd8a9 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -3,16 +3,17 @@
 from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence
 from dataclasses import dataclass
 from functools import lru_cache
-from itertools import groupby
-from typing import Any, Generic, NamedTuple, Optional, Protocol, TypeVar, Union
+from typing import (Any, Dict, Generic, NamedTuple, Optional, Protocol,
+                    TypeVar, Union, cast)
 
-import numpy as np
-from transformers import BatchFeature
+import torch
+from transformers import BatchFeature, ProcessorMixin
 from typing_extensions import TypeAlias, TypedDict
 
-from vllm.inputs import InputProcessingContext
+from vllm.inputs import DummyData, InputProcessingContext
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import flatten_2d_lists, full_groupby, is_list_of
+from vllm.utils import (flatten_2d_lists, full_groupby, is_list_of,
+                        resolve_mm_processor_kwargs)
 
 from .inputs import (AudioItem, ImageItem, MultiModalDataDict,
                      MultiModalInputsV2, MultiModalKwargs, PlaceholderRange,
@@ -256,63 +257,6 @@ def to_multi_format(data: MultiModalDataDict) -> dict[str, list[Any]]:
     return multi_data
 
 
-class _TokenRun(NamedTuple):
-    token_id: int
-
-    start_idx: int
-    length: int
-
-
-def iter_token_runs(token_ids: list[int]) -> Iterable[_TokenRun]:
-    """
-    Yield the starting index and length of each run of tokens that are the same.
-    """
-    start_idx = 0
-
-    for token_id, it in groupby(token_ids):
-        length = sum(1 for _ in it)
-        yield _TokenRun(token_id=token_id, start_idx=start_idx, length=length)
-
-        start_idx += length
-
-
-class _PlaceholderInfo(NamedTuple):
-    modality: str
-    offset: int
-    length: int
-
-    def to_range(self) -> PlaceholderRange:
-        return PlaceholderRange(offset=self.offset, length=self.length)
-
-
-def iter_placeholders(
-    prompt_repls: Sequence[_BoundPromptReplacement[Any]],
-    token_ids: list[int],
-    *,
-    min_placeholder_count: int,
-) -> Iterable[_PlaceholderInfo]:
-    """Yield each set of placeholder tokens found in :code:`token_ids`."""
-    placeholder_ids_by_modality = {
-        modality: {
-            token_id
-            for prompt_repl in repls
-            for token_id in prompt_repl.repl_unit.token_ids
-        }
-        for modality, repls in full_groupby_modality(prompt_repls)
-    }
-
-    for run_info in iter_token_runs(token_ids):
-        if run_info.length > min_placeholder_count:
-            for (modality,
-                 placeholder_ids) in placeholder_ids_by_modality.items():
-                if run_info.token_id in placeholder_ids:
-                    yield _PlaceholderInfo(
-                        modality=modality,
-                        offset=run_info.start_idx,
-                        length=run_info.length,
-                    )
-
-
 class _TokenMatch(NamedTuple):
     start_idx: int
     end_idx: int
@@ -353,13 +297,9 @@ def start_idx(self) -> int:
     def end_idx(self) -> int:
         raise NotImplementedError
 
+    @property
     @abstractmethod
-    def get_repl(
-        self,
-        mm_items: list[_T],
-        hf_inputs: BatchFeature,
-        item_idx: int,
-    ) -> _S:
+    def repl_unit(self) -> _S:
         raise NotImplementedError
 
     def __repr__(self) -> str:
@@ -380,15 +320,9 @@ def start_idx(self) -> int:
     def end_idx(self) -> int:
         return self.match.end_idx
 
-    def get_repl(
-        self,
-        mm_items: list[_T],
-        hf_inputs: BatchFeature,
-        item_idx: int,
-    ) -> list[int]:
-        prompt_repl = self.prompt_repl
-        count = prompt_repl.get_count(mm_items, hf_inputs, item_idx)
-        return prompt_repl.repl_unit.token_ids * count
+    @property
+    def repl_unit(self) -> list[int]:
+        return self.prompt_repl.repl_unit.token_ids
 
 
 @dataclass(repr=False)
@@ -404,15 +338,26 @@ def start_idx(self) -> int:
     def end_idx(self) -> int:
         return self.match.end()
 
-    def get_repl(
-        self,
-        mm_items: list[_T],
-        hf_inputs: BatchFeature,
-        item_idx: int,
-    ) -> str:
-        prompt_repl = self.prompt_repl
-        count = prompt_repl.get_count(mm_items, hf_inputs, item_idx)
-        return prompt_repl.repl_unit.text * count
+    @property
+    def repl_unit(self) -> str:
+        return self.prompt_repl.repl_unit.text
+
+
+class _PlaceholderInfo(NamedTuple):
+    modality: str
+    start_idx: int
+    unit: list[int]
+    unit_count: int
+
+    @property
+    def length(self) -> int:
+        return len(self.unit) * self.unit_count
+
+    def to_range(self) -> PlaceholderRange:
+        return PlaceholderRange(
+            offset=self.start_idx,
+            length=self.length,
+        )
 
 
 def find_token_matches(
@@ -447,15 +392,17 @@ def _resolve_matches(
     Resolve :code:`matches` to ensure that there are no overlapping matches,
     and sort them such that earlier matches take priority over later ones.
     """
-    num_matches_by_idx = np.zeros(len(prompt), dtype=int)
+    seen_matches: list[Optional[_PromptReplacementMatch[_T, _S]]] \
+        = [None] * len(prompt)
+
     for match in matches:
-        num_matches_by_idx[match.start_idx:match.end_idx] += 1
+        for idx in range(match.start_idx, match.end_idx):
+            if seen_matches[idx] is not None:
+                raise ValueError("Found overlapping matches "
+                                 f"({seen_matches[idx]} and {match}) "
+                                 f"at index={idx} of prompt={prompt}")
 
-    duplicate_matches_idxs, = np.nonzero(num_matches_by_idx > 1)
-    if len(duplicate_matches_idxs) > 0:
-        raise ValueError("Unable to find a unique replacement "
-                         f"at indices={duplicate_matches_idxs} "
-                         f"of prompt={prompt}")
+            seen_matches[idx] = match
 
     return sorted(matches, key=lambda x: x.start_idx)
 
@@ -480,9 +427,12 @@ def _replace_matches(
 
         start_idx = match.start_idx
         end_idx = match.end_idx
-        repl_ids = match.get_repl(mm_items, hf_inputs, item_idx)
+        repl_unit = match.repl_unit
+        repl_info = match.prompt_repl
+        repl_count = repl_info.get_count(mm_items, hf_inputs, item_idx)
 
-        out_seqs.append(prompt[prev_end_idx:start_idx] + repl_ids)
+        out_seqs.append(prompt[prev_end_idx:start_idx] +
+                        repl_unit * repl_count)
         prev_end_idx = end_idx
         next_idx_by_modality[modality] += 1
 
@@ -531,9 +481,59 @@ def replace_text_matches(
     return "".join(texts)
 
 
-class MultiModalProcessor:
+def _merge_placeholder_matches(
+    matches: Iterable[_PromptReplacementTokenMatch],
+) -> Iterable[_PromptReplacementTokenMatch]:
+    current_match = None
+
+    for match in sorted(matches, key=lambda x: x.start_idx):
+        if current_match is None:
+            current_match = match
+        elif (current_match.prompt_repl == match.prompt_repl
+              and current_match.end_idx == match.start_idx):
+            current_match = _PromptReplacementTokenMatch(
+                current_match.prompt_repl,
+                match=_TokenMatch(current_match.start_idx, match.end_idx),
+            )
+        else:
+            yield current_match
+            current_match = match
+
+    if current_match is not None:
+        yield current_match
+
+
+def iter_placeholders(
+    prompt_repls: Sequence[_BoundPromptReplacement[Any]],
+    prompt: list[int],
+    *,
+    min_unit_count: int = 1,
+) -> Iterable[_PlaceholderInfo]:
+    """Yield each set of placeholder tokens found in :code:`token_ids`."""
+    if min_unit_count <= 0:
+        raise ValueError("`min_unit_count` must be a positive integer")
+
+    matches = (_PromptReplacementTokenMatch(prompt_repl, match)
+               for prompt_repl in prompt_repls
+               if len(repl_unit := prompt_repl.repl_unit.token_ids) > 0
+               for match in iter_token_matches(prompt, repl_unit))
+
+    for match in _merge_placeholder_matches(matches):
+        unit = match.repl_unit
+        placeholder = _PlaceholderInfo(
+            modality=match.modality,
+            start_idx=match.start_idx,
+            unit=unit,
+            unit_count=(match.end_idx - match.start_idx) // len(unit),
+        )
+
+        if placeholder.unit_count >= min_unit_count:
+            yield placeholder
+
+
+class BaseMultiModalProcessor(ABC):
     """
-    Helper class to process multi-modal inputs to be used in vLLM.
+    Abstract base class to process multi-modal inputs to be used in vLLM.
     """
 
     def __init__(
@@ -545,6 +545,18 @@ def __init__(
 
         self.ctx = ctx
         self.metadata = metadata
+        self.init_mm_processor_kwargs = (ctx.model_config.mm_processor_kwargs
+                                         or {})
+
+    def _get_hf_processor(
+        self,
+        **mm_processor_kwargs: Mapping[str, object],
+    ) -> ProcessorMixin:
+        # by default, we won't pass any kwargs to the processor initialization
+        return self.ctx.get_hf_processor()
+
+    def _get_tokenizer(self) -> AnyTokenizer:
+        return self.ctx.tokenizer
 
     def __call__(
         self,
@@ -562,13 +574,13 @@ def _find_placeholders(
         # To avoid false positives from multi-input when detecting
         # whether placeholder tokens have been inserted, in case
         # the target sequence is a subset of the replacement tokens
-        min_placeholder_count: int = 16,
+        min_unit_count: int = 16,
     ) -> list[_PlaceholderInfo]:
         return list(
             iter_placeholders(
                 all_prompt_repls,
                 new_token_ids,
-                min_placeholder_count=min_placeholder_count,
+                min_unit_count=min_unit_count,
             ))
 
     def _apply_hf_processor(
@@ -577,19 +589,62 @@ def _apply_hf_processor(
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        hf_processor = self.ctx.get_hf_processor()
-
-        return hf_processor(
-            text=prompt,  # type: ignore
-            **mm_data,
+        # some mm_processor_kwargs may be used in processor initialization
+        # instead of processor call
+        processor_init_kwargs = {
+            **self.init_mm_processor_kwargs,
             **mm_processor_kwargs,
+        }
+        hf_processor = self._get_hf_processor(**processor_init_kwargs)
+
+        processor_data = dict[str, Any]()
+        passthrough_data = dict[str, Any]()
+        for k, v in mm_data.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
+            if k in ("image", "video", "audio"):
+                if isinstance(v, torch.Tensor) and v.ndim == 3:
+                    # Pass through embedding inputs (single)
+                    passthrough_data[f"{k}_embeds"] = [v]
+                elif is_list_of(v, torch.Tensor) and v[0].ndim == 2:
+                    # Pass through embedding inputs (multi)
+                    passthrough_data[f"{k}_embeds"] = v
+                else:
+                    # Map keys to plural form, e.g.: image -> images
+                    processor_data[f"{k}s"] = v
+            else:
+                processor_data[k] = v
+
+        # filter mm_processor_kwargs used in processor call
+        mm_processor_kwargs = resolve_mm_processor_kwargs(
+            self.init_mm_processor_kwargs,
+            cast(Dict[str, Any], mm_processor_kwargs),
+            hf_processor,
         )
 
+        try:
+            hf_inputs = hf_processor(
+                text=prompt,  # type: ignore
+                **processor_data,
+                **mm_processor_kwargs,
+                return_tensors="pt",
+            )
+        except Exception as exc:
+            data = dict(text=prompt, **processor_data)
+
+            raise RuntimeError(
+                f"Failed to apply {type(hf_processor).__name__} "
+                f"on data={data} with kwargs={mm_processor_kwargs}") from exc
+
+        hf_inputs.update(passthrough_data)
+
+        return hf_inputs
+
     def _bind_prompt_replacements(
         self,
         mm_data: MultiModalDataDict,
     ) -> list[_BoundPromptReplacement[Any]]:
-        tokenizer = self.ctx.tokenizer
+        tokenizer = self._get_tokenizer()
 
         return [
             prompt_repl.bind(modality, tokenizer)
@@ -604,7 +659,7 @@ def _apply_prompt_replacements(
         token_ids: list[int],
         prompt_repls: Sequence[_BoundPromptReplacement[Any]],
     ) -> tuple[list[int], str, list[_PlaceholderInfo]]:
-        tokenizer = self.ctx.tokenizer
+        tokenizer = self._get_tokenizer()
 
         mm_items = to_multi_format(mm_data)
         token_matches = find_token_matches(token_ids, prompt_repls)
@@ -620,7 +675,7 @@ def _apply_prompt_replacements(
         # of the search text in the prompt, we instead perform string
         # replacement on the decoded token IDs, then encode them back.
         if all(
-            len(matches) >= len(mm_data[modality])
+            len(matches) >= len(mm_items[modality])
             for modality, matches in full_groupby_modality(token_matches)
         ):  # yapf: disable
             token_ids = replace_token_matches(
@@ -648,15 +703,6 @@ def _apply_prompt_replacements(
 
         placeholders = self._find_placeholders(matched_repls, token_ids)
 
-        # Sanity check
-        assert len(placeholders) == len(matched_repls), dict(
-            # Log this information for easier debugging
-            text=text,
-            token_ids=token_ids,
-            placeholders=placeholders,
-            matched_repls=matched_repls,
-        )
-
         return token_ids, text, placeholders
 
     def apply(
@@ -678,7 +724,7 @@ def apply(
         3. Extract information about the placeholder tokens from the
            processed token IDs.
         """
-        tokenizer = self.ctx.tokenizer
+        tokenizer = self._get_tokenizer()
 
         hf_inputs = self._apply_hf_processor(prompt_text, mm_data,
                                              mm_processor_kwargs)
@@ -717,3 +763,59 @@ def apply(
             mm_kwargs=mm_kwargs,
             mm_placeholders=mm_placeholders,
         )
+
+    @abstractmethod
+    def _get_dummy_mm_kwargs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalKwargs:
+        """
+        Build the input that corresponds to `mm_max_tokens` in
+        :meth:`get_dummy_data`.
+        """
+        raise NotImplementedError
+
+    def get_dummy_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_max_tokens: Mapping[str, int],
+    ) -> DummyData:
+        # Avoid circular import
+        from vllm.sequence import SequenceData
+
+        tokenizer = self._get_tokenizer()
+
+        mm_placeholders = dict[str, _PlaceholderInfo]()
+        offset = 0
+
+        for modality, max_tokens in mm_max_tokens.items():
+            if max_tokens == 0:
+                continue
+
+            metadata = self.metadata[modality]
+            repl = metadata.prompt_repls[0].bind(modality, tokenizer)
+            repl_token_ids = repl.repl_unit.token_ids
+
+            placeholders = _PlaceholderInfo(
+                modality=modality,
+                start_idx=offset,
+                unit=repl_token_ids,
+                unit_count=max_tokens // len(repl_token_ids),
+            )
+
+            mm_placeholders[modality] = placeholders
+            offset += placeholders.length
+
+        prompt_token_ids = flatten_2d_lists(
+            [p.unit * p.unit_count for p in mm_placeholders.values()])
+        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
+
+        return DummyData(
+            seq_data=SequenceData.from_seqs(prompt_token_ids),
+            multi_modal_data=self._get_dummy_mm_kwargs(mm_counts),
+            multi_modal_placeholders={
+                modality: [p.to_range()]
+                for modality, p in mm_placeholders.items()
+            },
+        )
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index b73daee98bd80..6ab6c0fe2f12e 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -15,7 +15,7 @@
 from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc
 from .image import ImagePlugin
 from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
-from .processing import MultiModalProcessor
+from .processing import BaseMultiModalProcessor
 from .video import VideoPlugin
 
 if TYPE_CHECKING:
@@ -26,7 +26,7 @@
 N = TypeVar("N", bound=Type[nn.Module])
 
 MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext],
-                                                 MultiModalProcessor]
+                                                 BaseMultiModalProcessor]
 """
 Constructs a :class:`MultiModalProcessor` instance from the context.
 
@@ -200,9 +200,12 @@ def register_max_image_tokens(
         """
         return self.register_max_multimodal_tokens("image", max_mm_tokens)
 
-    def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
+    def get_max_tokens_by_modality(
+        self,
+        model_config: "ModelConfig",
+    ) -> Mapping[str, int]:
         """
-        Get the maximum number of multi-modal tokens
+        Get the maximum number of tokens from each modality
         for profiling the memory usage of a model.
 
         See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
@@ -212,9 +215,23 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
         """
         limits_per_plugin = self._limits_by_model[model_config]
 
-        return sum((limits_per_plugin[key] *
-                    plugin.get_max_multimodal_tokens(model_config))
-                   for key, plugin in self._plugins.items())
+        return {
+            key: (limits_per_plugin[key] *
+                  plugin.get_max_multimodal_tokens(model_config))
+            for key, plugin in self._plugins.items()
+        }
+
+    def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
+        """
+        Get the maximum number of multi-modal tokens
+        for profiling the memory usage of a model.
+
+        See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
+
+        Note:
+            This should be called after :meth:`init_mm_limits_per_prompt`.
+        """
+        return sum(self.get_max_tokens_by_modality(model_config).values())
 
     def init_mm_limits_per_prompt(
         self,
@@ -270,7 +287,8 @@ def register_processor(
         factory: MultiModalProcessorFactory,
     ):
         """
-        Register a multi-modal processor to a model class.
+        Register a multi-modal processor to a model class. The processor
+        is constructed lazily, hence a factory method should be passed.
 
         When the model receives multi-modal data, the provided function is
         invoked to transform the data into a dictionary of model inputs.
@@ -307,7 +325,7 @@ def create_processor(
         self,
         model_config: "ModelConfig",
         tokenizer: AnyTokenizer,
-    ) -> MultiModalProcessor:
+    ) -> BaseMultiModalProcessor:
         """
         Create a multi-modal processor for a specific model and tokenizer.
         """
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index d4333b7519b47..c898ca4e6573e 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -535,11 +535,13 @@ def repeat_and_pad_placeholder_tokens(
     return new_prompt, new_token_ids, placeholder_ranges
 
 
-def consecutive_placeholder_ranges(num_items: int,
-                                   item_size: int) -> List[PlaceholderRange]:
+def consecutive_placeholder_ranges(
+        num_items: int,
+        item_size: int,
+        initial_offset: int = 0) -> List[PlaceholderRange]:
     """Returns a list of consecutive PlaceholderRanges of a fixed size"""
 
     return [
-        PlaceholderRange(offset=i * item_size, length=item_size)
-        for i in range(num_items)
+        PlaceholderRange(offset=initial_offset + i * item_size,
+                         length=item_size) for i in range(num_items)
     ]
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 7cb8ac4b0a1e0..419237c252ffd 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -1,5 +1,5 @@
 from .interface import _Backend  # noqa: F401
-from .interface import Platform, PlatformEnum, UnspecifiedPlatform
+from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform
 
 current_platform: Platform
 
@@ -120,4 +120,4 @@ def cuda_is_jetson() -> bool:
 else:
     current_platform = UnspecifiedPlatform()
 
-__all__ = ['Platform', 'PlatformEnum', 'current_platform']
+__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum']
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index b5333fbd6f502..e5142b985d1f2 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import psutil
 import torch
@@ -37,6 +37,10 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         return psutil.virtual_memory().total
 
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return False
+
     @classmethod
     def inference_mode(cls):
         return torch.no_grad()
@@ -46,7 +50,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         import vllm.envs as envs
         from vllm.utils import GiB_bytes
         model_config = vllm_config.model_config
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if not model_config.enforce_eager:
             logger.warning(
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 846a1869da228..edaf377b501df 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -4,7 +4,7 @@
 
 import os
 from functools import lru_cache, wraps
-from typing import TYPE_CHECKING, Callable, List, TypeVar
+from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar
 
 import pynvml
 import torch
@@ -88,6 +88,16 @@ def get_device_name(cls, device_id: int = 0) -> str:
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         raise NotImplementedError
 
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        if enforce_eager:
+            logger.warning(
+                "To see benefits of async output processing, enable CUDA "
+                "graph. Since, enforce-eager is enabled, async output "
+                "processor cannot be used")
+            return False
+        return True
+
     @classmethod
     def is_full_nvlink(cls, device_ids: List[int]) -> bool:
         raise NotImplementedError
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 10aaa6d54962c..7f22bee3eaa74 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
@@ -20,6 +20,10 @@ class HpuPlatform(Platform):
     def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
         return _Backend.HPU_ATTN
 
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return True
+
     @staticmethod
     def inference_mode():
         return torch.no_grad()
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index eac2b413f9271..db06d2c18e681 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -1,15 +1,20 @@
 import enum
+import platform
 import random
 from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union
 
 import numpy as np
 import torch
 
+from vllm.logger import init_logger
+
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
 else:
     VllmConfig = None
 
+logger = init_logger(__name__)
+
 
 class _Backend(enum.Enum):
     FLASH_ATTN = enum.auto()
@@ -37,6 +42,14 @@ class PlatformEnum(enum.Enum):
     UNSPECIFIED = enum.auto()
 
 
+class CpuArchEnum(enum.Enum):
+    X86 = enum.auto()
+    ARM = enum.auto()
+    POWERPC = enum.auto()
+    OTHER = enum.auto()
+    UNKNOWN = enum.auto()
+
+
 class DeviceCapability(NamedTuple):
     major: int
     minor: int
@@ -138,6 +151,13 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
         """Get the total memory of a device in bytes."""
         raise NotImplementedError
 
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        """
+        Check if the current platform supports async output.
+        """
+        raise NotImplementedError
+
     @classmethod
     def inference_mode(cls):
         """A device-specific wrapper of `torch.inference_mode`.
@@ -184,6 +204,23 @@ def verify_quantization(cls, quant: str) -> None:
                 f"{quant} quantization is currently not supported in "
                 f"{cls.device_name}.")
 
+    @classmethod
+    def get_cpu_architecture(cls) -> CpuArchEnum:
+        """
+        Determine the CPU architecture of the current system.
+        Returns CpuArchEnum indicating the architecture type.
+        """
+        machine = platform.machine().lower()
+
+        if machine in ("x86_64", "amd64", "i386", "i686"):
+            return CpuArchEnum.X86
+        elif machine.startswith("arm") or machine.startswith("aarch"):
+            return CpuArchEnum.ARM
+        elif machine.startswith("ppc"):
+            return CpuArchEnum.POWERPC
+
+        return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 87655ea198303..1e5c4bddfa24f 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 from .interface import Platform, PlatformEnum
 
@@ -18,6 +18,10 @@ class NeuronPlatform(Platform):
     def get_device_name(cls, device_id: int = 0) -> str:
         return "neuron"
 
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return False
+
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         parallel_config = vllm_config.parallel_config
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index 29b61e955d9ab..e0f8e8b4b49fe 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
@@ -37,6 +37,10 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
     def get_device_name(self, device_id: int = 0) -> str:
         return "openvino"
 
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return False
+
     @classmethod
     def inference_mode(self):
         return torch.inference_mode(mode=True)
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index d2f7cd40e25b2..a553956d6cd07 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -1,6 +1,6 @@
 import os
 from functools import lru_cache, wraps
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional
 
 import torch
 from amdsmi import (AmdSmiException, amdsmi_get_gpu_board_info,
@@ -139,6 +139,16 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
         device_props = torch.cuda.get_device_properties(device_id)
         return device_props.total_memory
 
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        if enforce_eager:
+            logger.warning(
+                "To see benefits of async output processing, enable CUDA "
+                "graph. Since, enforce-eager is enabled, async output "
+                "processor cannot be used")
+            return False
+        return True
+
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         parallel_config = vllm_config.parallel_config
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index b138f7e1c54c5..10d874349f36b 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
@@ -35,6 +35,10 @@ def get_device_name(cls, device_id: int = 0) -> str:
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         raise NotImplementedError
 
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return True
+
     @classmethod
     def inference_mode(cls):
         return torch.no_grad()
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 9665786f4c499..11dbd04d55671 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
@@ -41,6 +41,10 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
         device_props = torch.xpu.get_device_properties(device_id)
         return device_props.total_memory
 
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return True
+
     @staticmethod
     def inference_mode():
         return torch.no_grad()
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 3c64726ca3344..17f604ea0e202 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -4,6 +4,7 @@
 import torch
 
 import vllm.envs as envs
+from vllm.platforms import current_platform
 
 logger = logging.getLogger(__name__)
 
@@ -25,6 +26,23 @@ def load_general_plugins():
     os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
     # see https://github.com/vllm-project/vllm/issues/10619
     torch._inductor.config.compile_threads = 1
+    if current_platform.is_xpu():
+        # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158  # noqa
+        os.environ['TORCH_COMPILE_DISABLE'] = 'True'
+    if current_platform.is_hpu():
+        # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
+        # does not support torch.compile
+        # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
+        # torch.compile support
+        is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
+        if is_lazy:
+            # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
+            torch._dynamo.config.disable = True
+            # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
+            # requires enabling lazy collectives
+            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
+            os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
+
     global plugins_loaded
     if plugins_loaded:
         return
@@ -39,7 +57,7 @@ def load_general_plugins():
 
     discovered_plugins = entry_points(group='vllm.general_plugins')
     if len(discovered_plugins) == 0:
-        logger.info("No plugins found.")
+        logger.debug("No plugins found.")
         return
     logger.info("Available plugins:")
     for plugin in discovered_plugins:
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 166a33c03451e..0310be0d9d886 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -305,8 +305,9 @@ def __post_init__(self) -> None:
                 raise ValueError(
                     f"best_of must be greater than or equal to n, "
                     f"got n={self.n} and best_of={self.best_of}.")
-            self._real_n = self.n
-            self.n = self.best_of
+            if not self._real_n:
+                self._real_n = self.n
+                self.n = self.best_of
 
         if 0 < self.temperature < _MAX_TEMP:
             logger.warning(
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index d249b37c780e4..676ac5eb3609d 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -120,6 +120,9 @@ def sampler_output(
                     indices_of_seq_with_bonus_tokens)
                 model_outputs.append(model_output)
 
+        # move indices to device to avoid stream sync
+        indices_of_seq_with_bonus_tokens = torch.tensor(
+            indices_of_seq_with_bonus_tokens, device=self.device)
         filtered_model_outputs = self._filter_model_output(
             model_outputs, indices_of_seq_with_bonus_tokens)
         return filtered_model_outputs, True
@@ -189,7 +192,7 @@ def _expand_execute_model_request(
     @staticmethod
     def _filter_model_output(
             expanded_batch_outputs: List[SamplerOutput],
-            output_indices_to_retain: List[int]) -> List[SamplerOutput]:
+            output_indices_to_retain: torch.Tensor) -> List[SamplerOutput]:
         """
         Filters the model output to include only the specified sequence
         outputs. This method contracts the expanded batch output from the
@@ -199,8 +202,8 @@ def _filter_model_output(
         Args:
             expanded_batch_output (List[SamplerOutput]): The expanded output
                 batch from the model.
-            output_indices_to_retain (List[int]): Indices of the model outputs
-                to retain.
+            output_indices_to_retain (torch.Tensor): Indices of the model
+                outputs to retain.
 
         Returns:
             List[SamplerOutput]: A list containing the filtered model 
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 53634f7b0b366..2689802161987 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -54,6 +54,10 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     speculative_config: SpeculativeConfig = vllm_config.speculative_config
     assert speculative_config is not None
 
+    if vllm_config.parallel_config.pipeline_parallel_size > 1:
+        raise NotImplementedError("Speculative decoding is currently "
+                                  "incompatible with pipeline parallelism")
+
     draft_worker_kwargs = kwargs.copy()
 
     kwargs["model_runner_cls"] = TargetModelRunner
@@ -104,7 +108,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     return spec_decode_worker
 
 
-# Reminder: Please update docs/source/serving/compatibility_matrix.rst
+# Reminder: Please update docs/source/usage/compatibility_matrix.rst
 # If the feature combo become valid
 class SpecDecodeWorker(LoraNotSupportedWorkerBase):
     """Worker which implements speculative decoding.
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index 6a114b513f382..c0b3d2585a962 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -1,7 +1,7 @@
 from typing import Optional, Type
 
-from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
-                         TokenizerPoolConfig)
+from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, TokenizerPoolConfig)
 from vllm.executor.ray_utils import ray
 
 from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup
@@ -16,10 +16,11 @@
 def init_tokenizer_from_configs(model_config: ModelConfig,
                                 scheduler_config: SchedulerConfig,
                                 parallel_config: ParallelConfig,
-                                enable_lora: bool):
+                                lora_config: LoRAConfig):
     init_kwargs = dict(tokenizer_id=model_config.tokenizer,
-                       enable_lora=enable_lora,
+                       enable_lora=bool(lora_config),
                        max_num_seqs=scheduler_config.max_num_seqs,
+                       max_loras=lora_config.max_loras if lora_config else 0,
                        max_input_length=None,
                        tokenizer_mode=model_config.tokenizer_mode,
                        trust_remote_code=model_config.trust_remote_code,
diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
index e516eeabaadef..761b07f34d2f9 100644
--- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
@@ -21,8 +21,9 @@ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
         self.enable_lora = enable_lora
         self.max_input_length = max_input_length
         self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
+        max_loras = tokenizer_config.get("max_loras", 0)
         self.lora_tokenizers = LRUCache[AnyTokenizer](
-            capacity=max_num_seqs if enable_lora else 0)
+            capacity=max(max_loras, max_num_seqs) if enable_lora else 0)
 
     @classmethod
     def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
diff --git a/vllm/utils.py b/vllm/utils.py
index 04e48bfb6b645..35940c72f5d9b 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -24,9 +24,9 @@
 from collections.abc import Iterable, Mapping
 from functools import lru_cache, partial, wraps
 from platform import uname
-from typing import (Any, AsyncGenerator, Awaitable, Callable, Dict, Generic,
-                    Hashable, List, Literal, Optional, OrderedDict, Set, Tuple,
-                    Type, TypeVar, Union, overload)
+from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
+                    Dict, Generic, Hashable, List, Literal, Optional,
+                    OrderedDict, Set, Tuple, Type, TypeVar, Union, overload)
 from uuid import uuid4
 
 import numpy as np
@@ -43,11 +43,14 @@
 from vllm.logger import enable_trace_function_call, init_logger
 from vllm.platforms import current_platform
 
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
 logger = init_logger(__name__)
 
 # Exception strings for non-implemented encoder/decoder scenarios
 
-# Reminder: Please update docs/source/serving/compatibility_matrix.rst
+# Reminder: Please update docs/source/usage/compatibility_matrix.rst
 # If the feature combo become valid
 
 STR_NOT_IMPL_ENC_DEC_SWA = \
@@ -452,17 +455,6 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
 
-@lru_cache(maxsize=None)
-def get_vllm_instance_id() -> str:
-    """
-    If the environment variable VLLM_INSTANCE_ID is set, return it.
-    Otherwise, return a random UUID.
-    Instance id represents an instance of the VLLM. All processes in the same
-    instance should have the same instance id.
-    """
-    return envs.VLLM_INSTANCE_ID or f"vllm-instance-{random_uuid()}"
-
-
 @lru_cache(maxsize=None)
 def in_wsl() -> bool:
     # Reference: https://github.com/microsoft/WSL/issues/4071
@@ -1114,7 +1106,7 @@ def find_nccl_library() -> str:
     return so_file
 
 
-def enable_trace_function_call_for_thread() -> None:
+def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None:
     """Set up function tracing for the current thread,
     if enabled via the VLLM_TRACE_FUNCTION environment variable
     """
@@ -1126,7 +1118,8 @@ def enable_trace_function_call_for_thread() -> None:
         filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
                     f"_thread_{threading.get_ident()}_"
                     f"at_{datetime.datetime.now()}.log").replace(" ", "_")
-        log_path = os.path.join(tmp_dir, "vllm", get_vllm_instance_id(),
+        log_path = os.path.join(tmp_dir, "vllm",
+                                f"vllm-instance-{vllm_config.instance_id}",
                                 filename)
         os.makedirs(os.path.dirname(log_path), exist_ok=True)
         enable_trace_function_call(log_path)
@@ -1772,7 +1765,7 @@ def direct_register_custom_op(
     library object. If you want to bind the operator to a different library,
     make sure the library object is alive when the operator is used.
     """
-    if is_in_doc_build():
+    if is_in_doc_build() or not supports_custom_op():
         return
     import torch.library
     if hasattr(torch.library, "infer_schema"):
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index d37989055c2e5..251a103e60f06 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -138,14 +138,25 @@ def forward(
             # Profiling run.
             return output
 
-        num_actual_tokens = attn_metadata.num_actual_tokens
+        # IMPORTANT!
+        # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
+        # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
+        # in this method. For example, `view` and `slice` (or `[:n]`) operations
+        # are surprisingly slow even in the case they do not invoke any GPU ops.
+        # Minimize the PyTorch ops in this method as much as possible.
+        # Whenever making a change in this method, please benchmark the
+        # performance to make sure it does not introduce any overhead.
 
+        num_actual_tokens = attn_metadata.num_actual_tokens
         # Reshape the input keys and values and store them in the cache.
-        key_cache = kv_cache[0]
-        value_cache = kv_cache[1]
+        # NOTE(woosuk): Here, key and value are padded while slot_mapping is
+        # not padded. However, we don't need to do key[:num_actual_tokens] and
+        # value[:num_actual_tokens] because the reshape_and_cache_flash op uses
+        # the slot_mapping's shape to determine the number of actual tokens.
+        key_cache, value_cache = kv_cache.unbind(0)
         torch.ops._C_cache_ops.reshape_and_cache_flash(
-            key[:num_actual_tokens],
-            value[:num_actual_tokens],
+            key,
+            value,
             key_cache,
             value_cache,
             attn_metadata.slot_mapping,
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index f1f26f4e8d443..1203d35fc985f 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -73,12 +73,12 @@ def __init__(
         # has the Transformer architecture (e.g., ViT).
         # FIXME(woosuk): Below are placeholder values. We need to calculate the
         # actual values from the configurations.
-        self.max_num_encoder_input_tokens = 2048
+        self.max_num_encoder_input_tokens = 16384
         # NOTE(woosuk): For the models without encoder (e.g., text-only models),
         # the encoder cache will not be initialized and used, regardless of
         # the cache size. This is because the memory space for the encoder cache
         # is preallocated in the profiling run.
-        self.encoder_cache_manager = EncoderCacheManager(cache_size=2048)
+        self.encoder_cache_manager = EncoderCacheManager(cache_size=16384)
 
     def schedule(self) -> "SchedulerOutput":
         # NOTE(woosuk) on the scheduling algorithm:
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 967124fd850ea..3cf0e610ae7af 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -1,11 +1,11 @@
 import enum
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import List, Optional, Union
 
 import msgspec
 
 from vllm.lora.request import LoRARequest
-from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
+from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 
 
@@ -35,9 +35,8 @@ class EngineCoreRequest:
     # always be tokenized?
     prompt: Optional[str]
     prompt_token_ids: List[int]
-    mm_data: Optional[MultiModalDataDict]
+    mm_inputs: Optional[List[MultiModalKwargs]]
     mm_placeholders: Optional[MultiModalPlaceholderDict]
-    mm_processor_kwargs: Optional[Dict[str, Any]]
     sampling_params: SamplingParams
     eos_token_id: Optional[int]
     arrival_time: float
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 7335c637f0f79..0bcccda2bf329 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -51,7 +51,7 @@ def __init__(
             model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
             parallel_config=vllm_config.parallel_config,
-            enable_lora=bool(vllm_config.lora_config))
+            lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
         # Request streams (map of request_id -> AsyncStream).
@@ -65,7 +65,12 @@ def __init__(
                                    input_registry)
 
         # Detokenizer (converts EngineCoreOutputs --> RequestOutput).
-        self.detokenizer = Detokenizer(vllm_config.model_config.tokenizer)
+        self.detokenizer = Detokenizer(
+            tokenizer_name=vllm_config.model_config.tokenizer,
+            tokenizer_mode=vllm_config.model_config.tokenizer_mode,
+            trust_remote_code=vllm_config.model_config.trust_remote_code,
+            revision=vllm_config.model_config.tokenizer_revision,
+        )
 
         # EngineCore (starts the engine in background process).
         self.engine_core = EngineCoreClient.make_client(
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 34f99dd30ef2e..751eb3b40a68d 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -67,6 +67,7 @@ def __init__(
 
     def _initialize_kv_caches(self,
                               cache_config: CacheConfig) -> Tuple[int, int]:
+        start = time.time()
         num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks(
         )
 
@@ -80,18 +81,14 @@ def _initialize_kv_caches(self,
 
         num_cpu_blocks = 0
         self.model_executor.initialize_cache(num_gpu_blocks)
+        elapsed = time.time() - start
+        logger.info(("init engine (profile, create kv cache, "
+                     "warmup model) took %.2f seconds"), elapsed)
         return num_gpu_blocks, num_cpu_blocks
 
     def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
-
         req = Request.from_engine_core_request(request)
-        # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may
-        # take 10-50 ms, which can cause a spike in the latency. We should
-        # consider moving this to a separate thread.
-        if req.mm_data:
-            req.mm_inputs = self.mm_input_mapper.process_inputs(
-                req.mm_data, req.mm_processor_kwargs)
         self.scheduler.add_request(req)
 
     def abort_requests(self, request_ids: List[str]):
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index bd19d998a4adb..994e68669108e 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -1,5 +1,7 @@
 from typing import Dict, List, Mapping, Optional, Type, Union
 
+from typing_extensions import TypeVar
+
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics_types import StatLoggerBase
@@ -12,7 +14,8 @@
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
+from vllm.transformers_utils.tokenizer_group import (
+    BaseTokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.detokenizer import Detokenizer
@@ -21,6 +24,8 @@
 
 logger = init_logger(__name__)
 
+_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
+
 
 class LLMEngine:
     """Legacy LLMEngine for backwards compatibility."""
@@ -46,7 +51,7 @@ def __init__(
             model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
             parallel_config=vllm_config.parallel_config,
-            enable_lora=bool(vllm_config.lora_config))
+            lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
         # Processor (convert Inputs --> EngineCoreRequests)
@@ -169,5 +174,18 @@ def start_profile(self):
     def stop_profile(self):
         self.engine_core.profile(False)
 
-    def get_tokenizer_group(self, group_type):
-        pass
+    def get_tokenizer_group(
+        self,
+        group_type: Type[_G] = BaseTokenizerGroup,
+    ) -> _G:
+        tokenizer_group = self.tokenizer
+
+        if tokenizer_group is None:
+            raise ValueError("Unable to get tokenizer because "
+                             "skip_tokenizer_init is True")
+        if not isinstance(tokenizer_group, group_type):
+            raise TypeError("Invalid type of tokenizer group. "
+                            f"Expected type: {group_type}, but "
+                            f"found type: {type(tokenizer_group)}")
+
+        return tokenizer_group
diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py
index 594c973678235..7ad6882b04520 100644
--- a/vllm/v1/engine/mm_input_mapper.py
+++ b/vllm/v1/engine/mm_input_mapper.py
@@ -12,6 +12,7 @@ def __init__(
         model_config: ModelConfig,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ):
+        self.model_config = model_config
         self.mm_registry = mm_registry
         self.multi_modal_input_mapper = mm_registry.create_input_mapper(
             model_config)
@@ -32,7 +33,7 @@ def process_inputs(
         num_images = len(image_inputs)
         for i in range(num_images):
             mm_input = self.multi_modal_input_mapper(
-                {"image": [image_inputs[i]]},
+                {"image": image_inputs[i]},
                 mm_processor_kwargs=mm_processor_kwargs,
             )
             mm_inputs.append(mm_input)
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 5c1577190c75a..120fc64969552 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -7,13 +7,15 @@
 from vllm.inputs.parse import is_encoder_decoder_inputs
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.lora.request import LoRARequest
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
+                             MultiModalRegistry)
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.config import try_get_generation_config
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest
+from vllm.v1.engine.mm_input_mapper import MMInputMapper
 
 
 class Processor:
@@ -39,6 +41,9 @@ def __init__(
         self.input_processor = input_registry.create_input_processor(
             model_config)
 
+        # Multi-modal (huggingface) input mapper
+        self.mm_input_mapper = MMInputMapper(model_config)
+
     # TODO: run in an ThreadpoolExecutor or BackgroundProcess.
     # This ideally should releases the GIL, so we should not block the
     # asyncio loop while this is running.
@@ -96,6 +101,17 @@ def process_inputs(
         sampling_params.update_from_generation_config(
             self.generation_config_fields, eos_token_id)
 
+        # Preprocess multi-modal data
+        if len(decoder_inputs.multi_modal_data) == 0:
+            mm_inputs = None
+        elif isinstance(decoder_inputs.multi_modal_data, MultiModalKwargs):
+            mm_inputs = [decoder_inputs.multi_modal_data]
+        else:
+            mm_inputs = self.mm_input_mapper.process_inputs(
+                decoder_inputs.multi_modal_data,
+                decoder_inputs.mm_processor_kwargs,
+            )
+
         # Make Request for Detokenizer.
         detokenizer_request = DetokenizerRequest(
             request_id,
@@ -113,9 +129,8 @@ def process_inputs(
             request_id,
             decoder_inputs.prompt,
             decoder_inputs.prompt_token_ids,
-            decoder_inputs.multi_modal_data,
+            mm_inputs,
             decoder_inputs.multi_modal_placeholders,
-            decoder_inputs.mm_processor_kwargs,
             sampling_params,
             eos_token_id,
             arrival_time,
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 51fb4003e5fe0..6bc1e4d5c769f 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -45,9 +45,6 @@ def __init__(
         self._all_token_ids: List[int] = self.prompt_token_ids.copy()
         self.num_computed_tokens = 0
 
-        # Raw multimodal data before the mm input mapper (e.g., PIL images).
-        self.mm_data = self.inputs.multi_modal_data
-        self.mm_processor_kwargs = self.inputs.mm_processor_kwargs
         mm_positions = self.inputs.multi_modal_placeholders
         if mm_positions:
             # FIXME(woosuk): Support other modalities.
@@ -55,7 +52,10 @@ def __init__(
         else:
             self.mm_positions = []
         # Output of the mm input mapper (e.g., image tensors).
-        self.mm_inputs: List[MultiModalKwargs] = []
+        if self.inputs.multi_modal_inputs:
+            self.mm_inputs = self.inputs.multi_modal_inputs
+        else:
+            self.mm_inputs: List[MultiModalKwargs] = []
 
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
@@ -64,9 +64,10 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
             inputs=token_inputs(
                 prompt_token_ids=request.prompt_token_ids,
                 prompt=request.prompt,
-                multi_modal_data=request.mm_data,
+                multi_modal_data=None,
+                multi_modal_inputs=request.mm_inputs,
                 multi_modal_placeholders=request.mm_placeholders,
-                mm_processor_kwargs=request.mm_processor_kwargs,
+                mm_processor_kwargs=None,
             ),
             sampling_params=request.sampling_params,
             eos_token_id=request.eos_token_id,
@@ -110,7 +111,7 @@ def get_finished_reason(self) -> Union[str, None]:
         return RequestStatus.get_finished_reason(self.status)
 
     def has_encoder_inputs(self) -> bool:
-        return len(self.mm_data) > 0
+        return len(self.mm_inputs) > 0
 
     @property
     def num_encoder_inputs(self) -> int:
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
new file mode 100644
index 0000000000000..457784bb0287c
--- /dev/null
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -0,0 +1,280 @@
+# Datastructures defining an input batch
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, List, Optional, Set
+
+import numpy as np
+import torch
+
+from vllm.multimodal import MultiModalKwargs
+from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.v1.sample.metadata import SamplingMetadata
+
+if TYPE_CHECKING:
+    from vllm.multimodal.inputs import PlaceholderRange
+
+
+@dataclass
+class CachedRequestState:
+
+    req_id: str
+    prompt_token_ids: List[int]
+    prompt: Optional[str]
+    mm_inputs: List[MultiModalKwargs]
+    mm_positions: List["PlaceholderRange"]
+    sampling_params: SamplingParams
+    generator: Optional[torch.Generator]
+
+    block_ids: List[int]
+    num_computed_tokens: int
+    output_token_ids: List[int]
+
+    @property
+    def num_tokens(self) -> int:
+        return len(self.prompt_token_ids) + len(self.output_token_ids)
+
+
+class InputBatch:
+
+    def __init__(
+        self,
+        max_num_reqs: int,
+        max_model_len: int,
+        max_num_blocks_per_req: int,
+        device: torch.device,
+        pin_memory: bool,
+    ):
+        self.max_num_reqs = max_num_reqs
+        self.max_model_len = max_model_len
+        self.max_num_blocks_per_req = max_num_blocks_per_req
+        self.device = device
+        self.pin_memory = pin_memory
+
+        self.req_ids: List[Optional[str]] = [None] * max_num_reqs
+        self.req_id_to_index: Dict[str, int] = {}
+
+        self.token_ids_cpu = np.empty((max_num_reqs, max_model_len),
+                                      dtype=np.int32)
+        self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
+
+        # Attention-related.
+        self.block_table = torch.zeros((max_num_reqs, max_num_blocks_per_req),
+                                       device=self.device,
+                                       dtype=torch.int32)
+        self.block_table_cpu_tensor = torch.zeros(
+            (max_num_reqs, max_num_blocks_per_req),
+            device="cpu",
+            dtype=torch.int32,
+            pin_memory=pin_memory,
+        )
+        self.block_table_cpu = self.block_table_cpu_tensor.numpy()
+
+        # Sampling-related.
+        self.temperature = torch.empty((max_num_reqs, ),
+                                       dtype=torch.float32,
+                                       device=device)
+        self.temperature_cpu_tensor = torch.empty((max_num_reqs, ),
+                                                  dtype=torch.float32,
+                                                  device="cpu",
+                                                  pin_memory=pin_memory)
+        self.temperature_cpu = self.temperature_cpu_tensor.numpy()
+        self.greedy_reqs: Set[str] = set()
+        self.random_reqs: Set[str] = set()
+
+        self.top_p = torch.empty((max_num_reqs, ),
+                                 dtype=torch.float32,
+                                 device=device)
+        self.top_p_cpu_tensor = torch.empty((max_num_reqs, ),
+                                            dtype=torch.float32,
+                                            device="cpu",
+                                            pin_memory=pin_memory)
+        self.top_p_cpu = self.top_p_cpu_tensor.numpy()
+        self.top_p_reqs: Set[str] = set()
+
+        self.top_k = torch.empty((max_num_reqs, ),
+                                 dtype=torch.int32,
+                                 device=device)
+        self.top_k_cpu_tensor = torch.empty((max_num_reqs, ),
+                                            dtype=torch.int32,
+                                            device="cpu",
+                                            pin_memory=pin_memory)
+        self.top_k_cpu = self.top_k_cpu_tensor.numpy()
+        self.top_k_reqs: Set[str] = set()
+
+        # req_index -> generator
+        self.generators: Dict[int, torch.Generator] = {}
+
+        self.num_logprobs: Dict[str, int] = {}
+        self.prompt_logprob_reqs: Set[str] = set()
+
+    def add_request(
+        self,
+        request: "CachedRequestState",
+        req_index: Optional[int] = None,
+    ) -> None:
+        if req_index is None:
+            req_index = self.num_reqs
+        assert req_index < self.max_num_reqs
+
+        req_id = request.req_id
+        self.req_ids[req_index] = req_id
+        self.req_id_to_index[req_id] = req_index
+
+        # Copy the prompt token ids and output token ids.
+        num_prompt_tokens = len(request.prompt_token_ids)
+        self.token_ids_cpu[
+            req_index, :num_prompt_tokens] = request.prompt_token_ids
+        start_idx = num_prompt_tokens
+        end_idx = start_idx + len(request.output_token_ids)
+        self.token_ids_cpu[req_index,
+                           start_idx:end_idx] = request.output_token_ids
+
+        self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
+        num_blocks = len(request.block_ids)
+        self.block_table_cpu[req_index, :num_blocks] = request.block_ids
+
+        sampling_params = request.sampling_params
+        self.temperature_cpu[req_index] = sampling_params.temperature
+        if sampling_params.sampling_type == SamplingType.GREEDY:
+            self.greedy_reqs.add(req_id)
+        else:
+            self.random_reqs.add(req_id)
+
+        self.top_p_cpu[req_index] = sampling_params.top_p
+        if sampling_params.top_p < 1:
+            self.top_p_reqs.add(req_id)
+        self.top_k_cpu[req_index] = sampling_params.top_k
+        if sampling_params.top_k > 0:
+            self.top_k_reqs.add(req_id)
+
+        self.generators[req_index] = request.generator
+
+        num_logprobs = sampling_params.logprobs
+        if num_logprobs is not None and num_logprobs > 0:
+            self.num_logprobs[req_id] = num_logprobs
+        if sampling_params.prompt_logprobs:
+            self.prompt_logprob_reqs.add(req_id)
+
+    def remove_request(self, req_id: str) -> Optional[int]:
+        req_index = self.req_id_to_index.pop(req_id, None)
+        if req_index is None:
+            return None
+        self.req_ids[req_index] = None
+
+        self.greedy_reqs.discard(req_id)
+        self.random_reqs.discard(req_id)
+        self.top_p_reqs.discard(req_id)
+        self.top_k_reqs.discard(req_id)
+        self.generators.pop(req_index, None)
+        self.num_logprobs.pop(req_id, None)
+        self.prompt_logprob_reqs.discard(req_id)
+        return req_index
+
+    def clear(self) -> None:
+        self.req_ids = [None] * self.max_num_reqs
+        self.req_id_to_index.clear()
+        self.greedy_reqs.clear()
+        self.random_reqs.clear()
+        self.top_p_reqs.clear()
+        self.top_k_reqs.clear()
+        self.generators.clear()
+        self.num_logprobs.clear()
+        self.prompt_logprob_reqs.clear()
+
+    def condense(self, empty_req_indices: List[int]) -> None:
+        if self.num_reqs == 0:
+            # The batched states are empty.
+            return
+
+        # NOTE(woosuk): This function assumes that the empty_req_indices
+        # is sorted in descending order.
+        last_req_index = self.num_reqs + len(empty_req_indices) - 1
+        while empty_req_indices:
+            # Find the largest non-empty index.
+            while last_req_index in empty_req_indices:
+                last_req_index -= 1
+
+            # Find the smallest empty index.
+            empty_index = empty_req_indices.pop()
+            if empty_index >= last_req_index:
+                break
+
+            # Swap the states.
+            req_id = self.req_ids[last_req_index]
+            self.req_ids[empty_index] = req_id
+            self.req_ids[last_req_index] = None
+            self.req_id_to_index[req_id] = empty_index
+
+            # TODO(woosuk): Optimize the copy of token_ids_cpu and
+            # block_table_cpu.
+            self.token_ids_cpu[empty_index] = self.token_ids_cpu[
+                last_req_index]
+            self.num_computed_tokens_cpu[
+                empty_index] = self.num_computed_tokens_cpu[last_req_index]
+            self.block_table_cpu[empty_index] = self.block_table_cpu[
+                last_req_index]
+            self.temperature_cpu[empty_index] = self.temperature_cpu[
+                last_req_index]
+            self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
+            self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
+            generator = self.generators.pop(last_req_index, None)
+            if generator is not None:
+                self.generators[empty_index] = generator
+
+            # Decrement last_req_index since it is now empty.
+            last_req_index -= 1
+
+    def make_sampling_metadata(
+        self,
+        skip_copy: bool = False,
+    ) -> SamplingMetadata:
+        if not skip_copy:
+            self.temperature[:self.num_reqs].copy_(
+                self.temperature_cpu_tensor[:self.num_reqs], non_blocking=True)
+            self.top_p[:self.num_reqs].copy_(
+                self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True)
+            self.top_k[:self.num_reqs].copy_(
+                self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True)
+        return SamplingMetadata(
+            temperature=self.temperature[:self.num_reqs],
+            all_greedy=self.all_greedy,
+            all_random=self.all_random,
+            top_p=self.top_p[:self.num_reqs],
+            top_k=self.top_k[:self.num_reqs],
+            no_top_p=self.no_top_p,
+            no_top_k=self.no_top_k,
+            generators=self.generators,
+            max_num_logprobs=self.max_num_logprobs,
+        )
+
+    @property
+    def num_reqs(self) -> int:
+        return len(self.req_id_to_index)
+
+    @property
+    def all_greedy(self) -> bool:
+        return len(self.random_reqs) == 0
+
+    @property
+    def all_random(self) -> bool:
+        return len(self.greedy_reqs) == 0
+
+    @property
+    def no_top_p(self) -> bool:
+        return len(self.top_p_reqs) == 0
+
+    @property
+    def no_top_k(self) -> bool:
+        return len(self.top_k_reqs) == 0
+
+    @property
+    def max_num_logprobs(self) -> int:
+        return max(self.num_logprobs.values()) if self.num_logprobs else 0
+
+    @property
+    def no_logprob(self) -> bool:
+        return len(self.num_logprobs) == 0
+
+    @property
+    def no_prompt_logprob(self) -> bool:
+        return len(self.prompt_logprob_reqs) == 0
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 1fa47f553dfd6..c601aca13feaf 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1,14 +1,12 @@
 import gc
 import time
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import numpy as np
 import torch
 import torch.distributed
 import torch.nn as nn
 
-from vllm.compilation.compile_context import set_compile_context
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.distributed.parallel_state import graph_capture
 from vllm.forward_context import set_forward_context
@@ -16,16 +14,16 @@
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import MultiModalKwargs
-from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.sampling_params import SamplingType
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv,
                         is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 
 if TYPE_CHECKING:
-    from vllm.multimodal.inputs import PlaceholderRange
     from vllm.v1.core.scheduler import SchedulerOutput
 
 logger = init_logger(__name__)
@@ -100,7 +98,11 @@ def __init__(
                                == CompilationLevel.PIECEWISE
                                and not self.model_config.enforce_eager)
         # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
-        self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)]
+        # The convention is different.
+        # self.cudagraph_batch_sizes sorts in ascending order.
+        # The batch sizes in the config are in descending order.
+        self.cudagraph_batch_sizes = list(
+            reversed(self.vllm_config.compilation_config.capture_sizes))
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,
                                      device=self.device)
@@ -257,7 +259,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
         # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
         # where M is the max_model_len.
-        token_indices = positions_np + req_indices * self.max_model_len
+        token_indices = (positions_np +
+                         req_indices * self.input_batch.token_ids_cpu.shape[1])
         token_indices = torch.from_numpy(token_indices)
         input_ids = torch.empty((total_num_scheduled_tokens, ),
                                 dtype=torch.int32,
@@ -270,9 +273,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
                            out=input_ids)
 
         # Calculate the slot mapping.
+        # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
+        # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
+        # where K is the max_num_blocks_per_req and the block size is 2.
+        # NOTE(woosuk): We can't simply use `token_indices // block_size` here
+        # because M (max_model_len) is not necessarily divisible by block_size.
         block_numbers = self.input_batch.block_table_cpu_tensor.flatten()[
-            token_indices // self.block_size]
-        block_offsets = token_indices % self.block_size
+            req_indices * self.max_num_blocks_per_req +
+            positions_np // self.block_size]
+        block_offsets = torch.from_numpy(positions_np % self.block_size)
         slot_mapping = torch.empty((total_num_scheduled_tokens, ),
                                    dtype=torch.int32,
                                    device="cpu",
@@ -548,10 +557,9 @@ def profile_run(self) -> None:
             torch.tensor([], dtype=torch.float32, device=self.device)
             for _ in range(self.num_attn_layers)
         ]
-        with set_compile_context(self.cudagraph_batch_sizes):
-            # Trigger compilation for general shape.
-            hidden_states = self._dummy_run(self.model, self.max_num_tokens,
-                                            dummy_kv_caches)
+        # Trigger compilation for general shape.
+        hidden_states = self._dummy_run(self.model, self.max_num_tokens,
+                                        dummy_kv_caches)
         logits = self.model.compute_logits(hidden_states, None)
         logits = logits[:self.max_num_tokens]
         # TODO(woosuk): Consider the memory usage of the sampler.
@@ -574,6 +582,9 @@ def capture_model(self) -> None:
         # can reuse the memory pool allocated for the large shapes.
         with graph_capture():
             for num_tokens in reversed(self.cudagraph_batch_sizes):
+                for _ in range(self.vllm_config.compilation_config.
+                               cudagraph_num_of_warmups):
+                    self._dummy_run(self.model, num_tokens, self.kv_caches)
                 self._dummy_run(self.model, num_tokens, self.kv_caches)
 
         end_time = time.perf_counter()
@@ -600,269 +611,3 @@ def _get_padded_batch_size(self, batch_size: int) -> Optional[int]:
             if batch_size <= size:
                 return size
         return None
-
-
-@dataclass
-class CachedRequestState:
-
-    req_id: str
-    prompt_token_ids: List[int]
-    prompt: Optional[str]
-    mm_inputs: List[MultiModalKwargs]
-    mm_positions: List["PlaceholderRange"]
-    sampling_params: SamplingParams
-    generator: Optional[torch.Generator]
-
-    block_ids: List[int]
-    num_computed_tokens: int
-    output_token_ids: List[int]
-
-    @property
-    def num_tokens(self) -> int:
-        return len(self.prompt_token_ids) + len(self.output_token_ids)
-
-
-class InputBatch:
-
-    def __init__(
-        self,
-        max_num_reqs: int,
-        max_model_len: int,
-        max_num_blocks_per_req: int,
-        device: torch.device,
-        pin_memory: bool,
-    ):
-        self.max_num_reqs = max_num_reqs
-        self.max_model_len = max_model_len
-        self.max_num_blocks_per_req = max_num_blocks_per_req
-        self.device = device
-        self.pin_memory = pin_memory
-
-        self.req_ids: List[Optional[str]] = [None] * max_num_reqs
-        self.req_id_to_index: Dict[str, int] = {}
-
-        self.token_ids_cpu = np.empty((max_num_reqs, max_model_len),
-                                      dtype=np.int32)
-        self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
-
-        # Attention-related.
-        self.block_table = torch.zeros((max_num_reqs, max_num_blocks_per_req),
-                                       device=self.device,
-                                       dtype=torch.int32)
-        self.block_table_cpu_tensor = torch.zeros(
-            (max_num_reqs, max_num_blocks_per_req),
-            device="cpu",
-            dtype=torch.int32,
-            pin_memory=pin_memory,
-        )
-        self.block_table_cpu = self.block_table_cpu_tensor.numpy()
-
-        # Sampling-related.
-        self.temperature = torch.empty((max_num_reqs, ),
-                                       dtype=torch.float32,
-                                       device=device)
-        self.temperature_cpu_tensor = torch.empty((max_num_reqs, ),
-                                                  dtype=torch.float32,
-                                                  device="cpu",
-                                                  pin_memory=pin_memory)
-        self.temperature_cpu = self.temperature_cpu_tensor.numpy()
-        self.greedy_reqs: Set[str] = set()
-        self.random_reqs: Set[str] = set()
-
-        self.top_p = torch.empty((max_num_reqs, ),
-                                 dtype=torch.float32,
-                                 device=device)
-        self.top_p_cpu_tensor = torch.empty((max_num_reqs, ),
-                                            dtype=torch.float32,
-                                            device="cpu",
-                                            pin_memory=pin_memory)
-        self.top_p_cpu = self.top_p_cpu_tensor.numpy()
-        self.top_p_reqs: Set[str] = set()
-
-        self.top_k = torch.empty((max_num_reqs, ),
-                                 dtype=torch.int32,
-                                 device=device)
-        self.top_k_cpu_tensor = torch.empty((max_num_reqs, ),
-                                            dtype=torch.int32,
-                                            device="cpu",
-                                            pin_memory=pin_memory)
-        self.top_k_cpu = self.top_k_cpu_tensor.numpy()
-        self.top_k_reqs: Set[str] = set()
-
-        # req_index -> generator
-        self.generators: Dict[int, torch.Generator] = {}
-
-        self.num_logprobs: Dict[str, int] = {}
-        self.prompt_logprob_reqs: Set[str] = set()
-
-    def add_request(
-        self,
-        request: "CachedRequestState",
-        req_index: Optional[int] = None,
-    ) -> None:
-        if req_index is None:
-            req_index = self.num_reqs
-        assert req_index < self.max_num_reqs
-
-        req_id = request.req_id
-        self.req_ids[req_index] = req_id
-        self.req_id_to_index[req_id] = req_index
-
-        # Copy the prompt token ids and output token ids.
-        num_prompt_tokens = len(request.prompt_token_ids)
-        self.token_ids_cpu[
-            req_index, :num_prompt_tokens] = request.prompt_token_ids
-        start_idx = num_prompt_tokens
-        end_idx = start_idx + len(request.output_token_ids)
-        self.token_ids_cpu[req_index,
-                           start_idx:end_idx] = request.output_token_ids
-
-        self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
-        num_blocks = len(request.block_ids)
-        self.block_table_cpu[req_index, :num_blocks] = request.block_ids
-
-        sampling_params = request.sampling_params
-        self.temperature_cpu[req_index] = sampling_params.temperature
-        if sampling_params.sampling_type == SamplingType.GREEDY:
-            self.greedy_reqs.add(req_id)
-        else:
-            self.random_reqs.add(req_id)
-
-        self.top_p_cpu[req_index] = sampling_params.top_p
-        if sampling_params.top_p < 1:
-            self.top_p_reqs.add(req_id)
-        self.top_k_cpu[req_index] = sampling_params.top_k
-        if sampling_params.top_k > 0:
-            self.top_k_reqs.add(req_id)
-
-        self.generators[req_index] = request.generator
-
-        num_logprobs = sampling_params.logprobs
-        if num_logprobs is not None and num_logprobs > 0:
-            self.num_logprobs[req_id] = num_logprobs
-        if sampling_params.prompt_logprobs:
-            self.prompt_logprob_reqs.add(req_id)
-
-    def remove_request(self, req_id: str) -> Optional[int]:
-        req_index = self.req_id_to_index.pop(req_id, None)
-        if req_index is None:
-            return None
-        self.req_ids[req_index] = None
-
-        self.greedy_reqs.discard(req_id)
-        self.random_reqs.discard(req_id)
-        self.top_p_reqs.discard(req_id)
-        self.top_k_reqs.discard(req_id)
-        self.generators.pop(req_index, None)
-        self.num_logprobs.pop(req_id, None)
-        self.prompt_logprob_reqs.discard(req_id)
-        return req_index
-
-    def clear(self) -> None:
-        self.req_ids = [None] * self.max_num_reqs
-        self.req_id_to_index.clear()
-        self.greedy_reqs.clear()
-        self.random_reqs.clear()
-        self.top_p_reqs.clear()
-        self.top_k_reqs.clear()
-        self.generators.clear()
-        self.num_logprobs.clear()
-        self.prompt_logprob_reqs.clear()
-
-    def condense(self, empty_req_indices: List[int]) -> None:
-        if self.num_reqs == 0:
-            # The batched states are empty.
-            return
-
-        # NOTE(woosuk): This function assumes that the empty_req_indices
-        # is sorted in descending order.
-        last_req_index = self.num_reqs + len(empty_req_indices) - 1
-        while empty_req_indices:
-            # Find the largest non-empty index.
-            while last_req_index in empty_req_indices:
-                last_req_index -= 1
-
-            # Find the smallest empty index.
-            empty_index = empty_req_indices.pop()
-            if empty_index >= last_req_index:
-                break
-
-            # Swap the states.
-            req_id = self.req_ids[last_req_index]
-            self.req_ids[empty_index] = req_id
-            self.req_ids[last_req_index] = None
-            self.req_id_to_index[req_id] = empty_index
-
-            # TODO(woosuk): Optimize the copy of token_ids_cpu and
-            # block_table_cpu.
-            self.token_ids_cpu[empty_index] = self.token_ids_cpu[
-                last_req_index]
-            self.num_computed_tokens_cpu[
-                empty_index] = self.num_computed_tokens_cpu[last_req_index]
-            self.block_table_cpu[empty_index] = self.block_table_cpu[
-                last_req_index]
-            self.temperature_cpu[empty_index] = self.temperature_cpu[
-                last_req_index]
-            self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
-            self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
-            generator = self.generators.pop(last_req_index, None)
-            if generator is not None:
-                self.generators[empty_index] = generator
-
-            # Decrement last_req_index since it is now empty.
-            last_req_index -= 1
-
-    def make_sampling_metadata(
-        self,
-        skip_copy: bool = False,
-    ) -> SamplingMetadata:
-        if not skip_copy:
-            self.temperature[:self.num_reqs].copy_(
-                self.temperature_cpu_tensor[:self.num_reqs], non_blocking=True)
-            self.top_p[:self.num_reqs].copy_(
-                self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True)
-            self.top_k[:self.num_reqs].copy_(
-                self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True)
-        return SamplingMetadata(
-            temperature=self.temperature[:self.num_reqs],
-            all_greedy=self.all_greedy,
-            all_random=self.all_random,
-            top_p=self.top_p[:self.num_reqs],
-            top_k=self.top_k[:self.num_reqs],
-            no_top_p=self.no_top_p,
-            no_top_k=self.no_top_k,
-            generators=self.generators,
-            max_num_logprobs=self.max_num_logprobs,
-        )
-
-    @property
-    def num_reqs(self) -> int:
-        return len(self.req_id_to_index)
-
-    @property
-    def all_greedy(self) -> bool:
-        return len(self.random_reqs) == 0
-
-    @property
-    def all_random(self) -> bool:
-        return len(self.greedy_reqs) == 0
-
-    @property
-    def no_top_p(self) -> bool:
-        return len(self.top_p_reqs) == 0
-
-    @property
-    def no_top_k(self) -> bool:
-        return len(self.top_k_reqs) == 0
-
-    @property
-    def max_num_logprobs(self) -> int:
-        return max(self.num_logprobs.values()) if self.num_logprobs else 0
-
-    @property
-    def no_logprob(self) -> bool:
-        return len(self.num_logprobs) == 0
-
-    @property
-    def no_prompt_logprob(self) -> bool:
-        return len(self.prompt_logprob_reqs) == 0
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index c2ebf853f41d8..cfd77d14bc697 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -25,8 +25,7 @@
 from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad
 from vllm.worker.model_runner import (GPUModelRunnerBase,
                                       ModelInputForGPUBuilder,
-                                      ModelInputForGPUWithSamplingMetadata,
-                                      _get_graph_batch_size)
+                                      ModelInputForGPUWithSamplingMetadata)
 from vllm.worker.model_runner_base import (
     _add_attn_metadata_broadcastable_dict,
     _add_sampling_metadata_broadcastable_dict)
@@ -465,7 +464,8 @@ def _prepare_encoder_model_input_tensors(
                 # We will be using CUDA graph replay for this decode.
                 max_len_of_block_table = self.get_max_block_per_batch()
                 batch_size = len(encoder_seq_lens)
-                graph_batch_size = _get_graph_batch_size(batch_size)
+                graph_batch_size = self.vllm_config.get_graph_batch_size(
+                    batch_size)
                 assert graph_batch_size >= batch_size
                 cuda_graph_pad_size = graph_batch_size - batch_size
                 # extend the cross_block_tables and encoder_seq_lens to match
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 741bdf6346c49..94ebf926bf5a6 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -18,7 +18,6 @@
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.attention.backends.abstract import AttentionState
 from vllm.attention.backends.utils import CommonAttentionState
-from vllm.compilation.compile_context import set_compile_context
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.distributed import get_kv_transfer_group, get_pp_group
@@ -63,16 +62,7 @@
 logger = init_logger(__name__)
 
 LORA_WARMUP_RANK = 8
-_BATCH_SIZE_ALIGNMENT = 8
-# all the token sizes that **can** be captured by cudagraph.
-# they can be arbitrarily large.
-# currently it includes: 1, 2, 4, 8, 16, 24, 32, 40, ..., 8192.
-# the actual sizes to capture will be determined by the model,
-# depending on the model's max_num_seqs.
-# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
-_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
-    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025)
-]
+
 _NUM_WARMUP_ITERS = 2
 
 TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU")
@@ -763,7 +753,6 @@ def _use_captured_graph(self,
                             max_decode_seq_len: int,
                             max_encoder_seq_len: int = 0) -> bool:
         return (decode_only and not self.runner.model_config.enforce_eager
-                and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1]
                 and max_decode_seq_len <= self.runner.max_seq_len_to_capture
                 and max_encoder_seq_len <= self.runner.max_seq_len_to_capture
                 and batch_size <= self.runner.max_batchsize_to_capture)
@@ -811,7 +800,7 @@ def _get_cuda_graph_pad_size(self,
                                         max_encoder_seq_len):
             return -1
 
-        graph_batch_size = _get_graph_batch_size(batch_size)
+        graph_batch_size = VllmConfig.get_graph_batch_size(batch_size)
         assert graph_batch_size >= batch_size
         return graph_batch_size - batch_size
 
@@ -1023,7 +1012,7 @@ def __init__(
         self.sliding_window = model_config.get_sliding_window()
         self.block_size = cache_config.block_size
         self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture
-        self.max_batchsize_to_capture = _get_max_graph_batch_size(
+        self.max_batchsize_to_capture = VllmConfig.get_max_graph_batch_size(
             self.scheduler_config.max_num_seqs)
 
         self.graph_runners: List[Dict[int, CUDAGraphRunner]] = [
@@ -1333,14 +1322,7 @@ def profile_run(self) -> None:
                 dtype=self.model_config.dtype,
                 device=self.device)
 
-        graph_batch_size = self.max_batchsize_to_capture
-        batch_size_capture_list = [
-            bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
-        ]
-        if self.model_config.enforce_eager:
-            batch_size_capture_list = []
-        with set_compile_context(batch_size_capture_list):
-            self.execute_model(model_input, kv_caches, intermediate_tensors)
+        self.execute_model(model_input, kv_caches, intermediate_tensors)
         torch.cuda.synchronize()
         return
 
@@ -1459,18 +1441,14 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                 dtype=self.model_config.dtype,
                 device=self.device)
 
-        graph_batch_size = self.max_batchsize_to_capture
-        batch_size_capture_list = [
-            bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
-        ]
-
         with self.attn_state.graph_capture(
                 max_batch_size), graph_capture() as graph_capture_context:
             # NOTE: Capturing the largest batch size first may help reduce the
             # memory usage of CUDA graph.
             for virtual_engine in range(
                     self.parallel_config.pipeline_parallel_size):
-                for batch_size in reversed(batch_size_capture_list):
+                for batch_size in \
+                    self.vllm_config.compilation_config.capture_sizes:
                     attn_metadata = (
                         self.attn_state.graph_capture_get_metadata_for_batch(
                             batch_size,
@@ -1805,6 +1783,9 @@ def need_recv_kv(self, model_input, kv_caches) -> bool:
             kv_caches: vLLM's paged memory
         """
 
+        if self.vllm_config.kv_transfer_config is None:
+            return False
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling
@@ -1812,9 +1793,6 @@ def need_recv_kv(self, model_input, kv_caches) -> bool:
         # check if the current run is prefill
         is_prefill_run = prefill_meta is not None
 
-        if self.vllm_config.kv_transfer_config is None:
-            return False
-
         return self.vllm_config.kv_transfer_config.is_kv_consumer and (
             not is_profile_run) and is_prefill_run
 
@@ -1830,6 +1808,9 @@ def need_send_kv(self, model_input, kv_caches) -> bool:
             kv_caches: vLLM's paged memory
         """
 
+        if self.vllm_config.kv_transfer_config is None:
+            return False
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling
@@ -1837,9 +1818,6 @@ def need_send_kv(self, model_input, kv_caches) -> bool:
         # check if the current run is prefill
         is_prefill_run = prefill_meta is not None
 
-        if self.vllm_config.kv_transfer_config is None:
-            return False
-
         return self.vllm_config.kv_transfer_config.is_kv_producer and (
             not is_profile_run) and is_prefill_run
 
@@ -1994,37 +1972,3 @@ def forward(
             return self.output_buffers["hidden_states"]
 
         return self.output_buffers
-
-
-def _get_graph_batch_size(batch_size: int) -> int:
-    """Returns the padded batch size given actual batch size.
-
-    Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT,
-    2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT...
-    """
-    if batch_size <= 2:
-        return batch_size
-    elif batch_size <= 4:
-        return 4
-    else:
-        return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) //
-                _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)
-
-
-def _get_max_graph_batch_size(max_num_seqs: int) -> int:
-    """
-    max_num_seqs: Maximum number of sequences in a batch.
-    _BATCH_SIZES_TO_CAPTURE: all the sizes that we want to capture.
-
-    pad the max_num_seqs if necessary by calling _get_graph_batch_size,
-    which will deal with some edge cases like 1, 2, 4.
-
-    if the padded size is in _BATCH_SIZES_TO_CAPTURE, return the padded size.
-    if not, it means the padded size is larger than the largest size in
-    _BATCH_SIZES_TO_CAPTURE, return the largest size in _BATCH_SIZES_TO_CAPTURE.
-    """
-    padded_size = _get_graph_batch_size(max_num_seqs)
-    if padded_size in _BATCH_SIZES_TO_CAPTURE:
-        return padded_size
-    assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1]
-    return _BATCH_SIZES_TO_CAPTURE[-1]
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 3ee0fb4dc943e..3ca0d88a42183 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -817,7 +817,7 @@ def _pythonize_sampler_output(
 
     for sgdx, (seq_group,
                sample_result) in enumerate(zip(seq_groups, samples_list)):
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         # (Check for Guided Decoding)
         if seq_group.sampling_params.logits_processors:
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 205f8a337ce6c..0bf522d5333ed 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -489,7 +489,7 @@ def model_profile_run():
                 block_size = cache_config.block_size
                 seq_num_blocks = (seq_len + block_size - 1) // block_size
 
-                seq_data, dummy_multi_modal_data = input_registry \
+                dummy_data = input_registry \
                     .dummy_data_for_profiling(model_config,
                                               seq_len,
                                               mm_registry)
@@ -498,11 +498,11 @@ def model_profile_run():
                 seq = SequenceGroupMetadata(
                     request_id=str(group_id),
                     is_prompt=True,
-                    seq_data={group_id: seq_data},
+                    seq_data={group_id: dummy_data.seq_data},
                     sampling_params=sampling_params,
                     block_tables=block_tables,
                     lora_request=None,
-                    multi_modal_data=dummy_multi_modal_data)
+                    multi_modal_data=dummy_data.multi_modal_data)
                 seqs.append(seq)
 
             self.model_runner.block_size = tmp_cache_config.block_size
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index f43635464ef00..5f71ec0c14df8 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario(
     a supported scenario.
     '''
 
-    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+    # Reminder: Please update docs/source/usage/compatibility_matrix.rst
     # If the feature combo become valid
 
     if enc_dec_mr.cache_config.enable_prefix_caching:
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 7c0bc5a678956..6d00102e0a324 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -439,7 +439,7 @@ def init_worker(self, *args, **kwargs):
         Here we inject some common logic before initializing the worker.
         Arguments are passed to the worker class constructor.
         """
-        enable_trace_function_call_for_thread()
+        enable_trace_function_call_for_thread(self.vllm_config)
 
         # see https://github.com/NVIDIA/nccl/issues/1234
         os.environ['NCCL_CUMEM_ENABLE'] = '0'