diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh new file mode 100644 index 0000000000000..d06604f96f2b8 --- /dev/null +++ b/.buildkite/run-gh200-test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# This script builds the GH200 docker image and runs the offline inference inside the container. +# It serves as a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +DOCKER_BUILDKIT=1 docker build . \ + --target vllm-openai \ + --platform "linux/arm64" \ + -t gh200-test \ + --build-arg max_jobs=66 \ + --build-arg nvcc_threads=2 \ + --build-arg torch_cuda_arch_list="9.0+PTX" \ + --build-arg vllm_fa_cmake_gpu_arches="90-real" + +# Setup cleanup +remove_docker_container() { docker rm -f gh200-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' + python3 examples/offline_inference.py +' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index df4fa7a6ee9ba..b563c96343f92 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -181,14 +181,14 @@ steps: commands: - VLLM_USE_V1=1 pytest -v -s v1 -- label: Examples Test # 15min +- label: Examples Test # 25min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] source_file_dependencies: - vllm/entrypoints - examples/ commands: - - pip install awscli tensorizer # for llava example and tensorizer test + - pip install tensorizer # for tensorizer test - python3 offline_inference.py - python3 cpu_offload.py - python3 offline_inference_chat.py @@ -198,7 +198,10 @@ steps: - python3 offline_inference_vision_language_multi_image.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py - - python3 offline_profile.py --model facebook/opt-125m + - python3 offline_inference_classification.py + - python3 offline_inference_embedding.py + - python3 offline_inference_scoring.py + - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] @@ -221,8 +224,12 @@ steps: mirror_hardwares: [amd] source_file_dependencies: - vllm/model_executor/layers + - vllm/model_executor/guided_decoding - tests/test_logits_processor - command: pytest -v -s test_logits_processor.py + - tests/model_executor/test_guided_processors + commands: + - pytest -v -s test_logits_processor.py + - pytest -v -s model_executor/test_guided_processors.py - label: Speculative decoding tests # 30min source_file_dependencies: - vllm/spec_decode @@ -321,7 +328,7 @@ steps: ##### models test ##### -- label: Basic Models Test # 30min +- label: Basic Models Test # 24min source_file_dependencies: - vllm/ - tests/models commands: - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py -- label: Language Models Test (Standard) # 42min +- label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -342,7 +349,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model -- label: Language Models Test (Extended) # 50min +- label: Language Models Test (Extended) # 1h10min optional: true
source_file_dependencies: - vllm/ @@ -353,7 +360,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 26min +- label: Multi-Modal Models Test (Standard) # 28min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -369,7 +376,7 @@ steps: - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Multi-Modal Models Test (Extended) # 1h15m +- label: Multi-Modal Models Test (Extended) 1 # 1h16m optional: true source_file_dependencies: - vllm/ @@ -380,14 +387,24 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' +- label: Multi-Modal Models Test (Extended) 2 # 38m + optional: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/vision_language + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' + # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test optional: true @@ -422,11 +439,11 @@ steps: - tests/distributed/ commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - label: Distributed Tests (2 GPUs) # 40min #mirror_hardwares: [amd] @@ -445,12 +462,12 @@ steps: commands: - pytest -v -s 
./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus - - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus + - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py @@ -540,7 +557,7 @@ steps: # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional diff --git a/CMakeLists.txt b/CMakeLists.txt index c78cdc77a7e42..51b49a18dddf2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,7 @@ set(VLLM_EXT_SRC "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" + "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" @@ -205,7 +206,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. - set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -222,13 +223,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git - GIT_TAG v3.5.1 + GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227 GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. 
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE - GIT_SHALLOW TRUE + GIT_SHALLOW FALSE ) endif() FetchContent_MakeAvailable(cutlass) @@ -240,7 +241,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu" "csrc/permute_cols.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu") + "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" + "csrc/sparse/cutlass/sparse_compressor_entry.cu" + "csrc/cutlass_extensions/common.cpp") set_gencode_flags_for_srcs( SRCS "${VLLM_EXT_SRC}" @@ -270,11 +274,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # - # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require + # The cutlass_scaled_mm, cutlass_scaled_sparse_mm, and cutlass_compressor kernels + # for Hopper (c3x, i.e. CUTLASS 3.x) require # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") + set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" + "csrc/sparse/cutlass/sparse_compressor_c3x.cu" + "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") @@ -283,12 +290,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is " + message(STATUS "Not building cutlass_c3x kernels as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " - "later if you intend on running FP8 quantized models on " + "later if you intend on running FP8 sparse or quantized models on " "Hopper.") else() - message(STATUS "Not building scaled_mm_c3x as no compatible archs found " + message(STATUS "Not building cutlass_c3x as no compatible archs found " "in CUDA target architectures") endif() @@ -300,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x.
- cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) @@ -403,7 +410,7 @@ define_gpu_extension_target( SOURCES ${VLLM_EXT_SRC} COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} - INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} USE_SABI 3 WITH_SOABI) diff --git a/Dockerfile b/Dockerfile index b38113f524a17..391ec2182a589 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 +ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies @@ -46,9 +47,14 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt +COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-cuda.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -63,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### WHEEL BUILD IMAGE #################### FROM base AS build +ARG TARGETPLATFORM # install build dependencies COPY requirements-build.txt requirements-build.txt @@ -70,6 +77,11 @@ COPY requirements-build.txt requirements-build.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi + COPY . . ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ @@ -134,8 +146,8 @@ COPY requirements-test.txt requirements-test.txt COPY requirements-dev.txt requirements-dev.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt - #################### DEV IMAGE #################### + #################### vLLM installation IMAGE #################### # image with vLLM installed FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base @@ -143,6 +155,9 @@ ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive +ARG TARGETPLATFORM + +COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment @@ -168,18 +183,25 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ -# install vllm wheel first, so that torch etc will be installed +# Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose RUN --mount=type=cache,target=/root/.cache/pip \ - . 
/etc/environment && \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + pip uninstall -y torch && \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ +. /etc/environment && \ +if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ +fi COPY examples examples #################### vLLM installation IMAGE #################### - #################### TEST IMAGE #################### # image to run unit testing suite # note that this uses vllm installed by `pip` @@ -209,7 +231,6 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 RUN mkdir test_docs RUN mv docs test_docs/ RUN mv vllm test_docs/ - #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### @@ -218,8 +239,11 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10 - + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \ + else \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \ + fi ENV VLLM_USAGE_SOURCE production-docker-image ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/README.md b/README.md index ed5161ccffb45..93b71ddaccc61 100644 --- a/README.md +++ b/README.md @@ -134,3 +134,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs * For coordinating contributions and development, please use Slack. * For security disclosures, please use Github's security advisory feature. * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. + +## Media Kit + +* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3256692142c5e..4eb0e1f8ac903 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -781,6 +781,7 @@ def main(args: argparse.Namespace): backend = args.backend model_id = args.model tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + tokenizer_mode = args.tokenizer_mode if args.base_url is not None: api_url = f"{args.base_url}{args.endpoint}" @@ -790,6 +791,7 @@ def main(args: argparse.Namespace): base_url = f"http://{args.host}:{args.port}" tokenizer = get_tokenizer(tokenizer_id, + tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) if args.dataset is not None: @@ -1210,5 +1212,15 @@ def main(args: argparse.Namespace): "from the sampled HF dataset.", ) + parser.add_argument( + '--tokenizer-mode', + type=str, + default="auto", + choices=['auto', 'slow', 'mistral'], + help='The tokenizer mode.\n\n* "auto" will use the ' + 'fast tokenizer if available.\n* "slow" will ' + 'always use the slow tokenizer. 
\n* ' + '"mistral" will always use the `mistral_common` tokenizer.') + args = parser.parse_args() main(args) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py new file mode 100644 index 0000000000000..3d1c5e392f9e2 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -0,0 +1,384 @@ +import argparse +import copy +import itertools +import pickle as pkl +import time +from typing import Callable, Iterable, List, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import make_rand_sparse_tensors +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + + +# bench +def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, + **kwargs) -> TMeasurement: + min_run_time = 1 + + globals = { + "args": args, + "kwargs": kwargs, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.int8 + b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + + out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b, + torch.bfloat16) + out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) + + if not torch.allclose(out, out_ref): + print("Incorrect results") + print(out) + print(out_ref) + else: + print("Correct results") + + timers = [] + # pytorch impl - bfloat16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, a.to(dtype=torch.bfloat16), + b.to(dtype=torch.bfloat16))) + + # pytorch impl - float16 + timers.append( + bench_fn(label, sub_label, + "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, + a.to(dtype=torch.float16), b.to(dtype=torch.float16))) + + # cutlass impl + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm", + ops.cutlass_scaled_mm, a, b, scale_a, scale_b, + torch.bfloat16)) + + # cutlass with bias + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias", + ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, + bias)) + + # cutlass sparse impl + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, + scale_b, torch.bfloat16)) + + # cutlass sparse with bias + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, + scale_b, torch.bfloat16, bias)) + + return timers + + +def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.float8_e4m3fn + b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, + k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = 
torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + + out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b, + torch.bfloat16) + out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) + + if not torch.allclose(out, out_ref): + print("Incorrect results") + print(out) + print(out_ref) + else: + print("Correct results") + + timers = [] + + # pytorch impl w. bf16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"))) + + # pytorch impl: bf16 output, without fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16)) + + # pytorch impl: bf16 output, with fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True)) + + # pytorch impl: fp16 output, without fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16)) + + # pytorch impl: fp16 output, with fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + use_fast_accum=True)) + + # cutlass impl: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm", + ops.cutlass_scaled_mm, a, b, scale_a, scale_b, + torch.bfloat16)) + + # cutlass impl: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, + scale_b, torch.bfloat16)) + + # cutlass impl: fp16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, + scale_b, torch.float16)) + + # cutlass impl: bf16 output, with bias + timers.append( + bench_fn(label, sub_label, + "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, + scale_b, torch.bfloat16, bias)) + + # cutlass impl: fp16 output, with bias + timers.append( + bench_fn(label, sub_label, + "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, + scale_b, torch.float16, bias.to(dtype=torch.float16))) + + return timers + + +def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + if dtype == torch.int8: + return bench_int8(dtype, m, k, n, label, sub_label) + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, m, k, n, label, sub_label) + raise ValueError("unsupported type") + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run(dtype: torch.dtype, + MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + results = [] + for m, k, n in MKNs: + timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", + f"MKN=({m}x{k}x{n})") + print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output(data: 
Iterable[TMeasurement], + MKNs: Iterable[Tuple[int, int, int]], + base_description: str, + timestamp=None): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list( + range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) + n = len(dim_sizes) + Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes + Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes + Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes + MKNs = list(zip(Ms, Ks, Ns)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args.dtype, MKNs) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError("unsupported dtype") + + parser = FlexibleArgumentParser( + description=""" +Benchmark Cutlass GEMM. + + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
+ """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']") + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py new file mode 100644 index 0000000000000..ef06fcd6604dd --- /dev/null +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -0,0 +1,96 @@ +# Cutlass bench utils +from typing import Iterable, Tuple + +import torch + +import vllm._custom_ops as ops + + +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.Tensor) -> torch.Tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def to_bf16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.bfloat16) + + +def to_fp16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.float16) + + +def make_rand_tensors(dtype: torch.dtype, m: int, n: int, + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + if dtype == torch.int8: + return to_int8(a), to_int8(b) + if dtype == torch.float8_e4m3fn: + return to_fp8(a), to_fp8(b) + + raise ValueError("unsupported dtype") + + +def prune_to_2_4(tensor): + # Reshape tensor to [N, 4] where N is number of groups of 4 + original_shape = tensor.shape + reshaped = tensor.reshape(-1, 4) + + # Get indices of top 2 absolute values in each group of 4 + _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) + + # Create binary mask + mask = torch.zeros_like(reshaped) + mask.scatter_(dim=1, + index=indices, + src=torch.ones_like(indices, dtype=mask.dtype)) + + # Apply mask and reshape back + pruned = reshaped * mask + + # Turn all -0.0 to 0.0 + pruned[pruned == -0.0] = 0.0 + + return pruned.reshape(original_shape) + + +def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, 
k), device='cuda').t() * 5 + + b = prune_to_2_4(b.t()).t() + + if dtype == torch.int8: + a, b = to_int8(a), to_int8(b) + elif dtype == torch.float8_e4m3fn: + a, b = to_fp8(a), to_fp8(b) + elif dtype == torch.float16: + a, b = to_fp16(a), to_fp16(b) + elif dtype == torch.bfloat16: + a, b = to_bf16(a), to_bf16(b) + else: + raise ValueError("unsupported dtype") + + b_compressed, e = ops.cutlass_sparse_compress(b.t()) + + # Compressed B, Metadata, Original A, B + return b_compressed, e, a, b + + +def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype, + m: int, n: int, k: int) -> \ + Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: + ABs = [] + for _ in range(num_tensors): + b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) + if b_comp is not None: + ABs.append(make_rand_sparse_tensors(dtype, m, n, k)) + BComps, Es, As, Bs = zip(*ABs) + return list(BComps), list(Es), list(As), list(Bs) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 63cf5d50cac75..d0353bc8cb42a 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -8,6 +8,7 @@ import torch import torch.utils.benchmark as TBenchmark from torch.utils.benchmark import Measurement as TMeasurement +from utils import make_rand_tensors from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops @@ -17,31 +18,6 @@ DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] DEFAULT_TP_SIZES = [1] -# helpers - - -def to_fp8(tensor: torch.Tensor) -> torch.Tensor: - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) - - -def to_int8(tensor: torch.Tensor) -> torch.Tensor: - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - -def make_rand_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> Tuple[torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device='cuda') * 5 - b = torch.randn((n, k), device='cuda').t() * 5 - - if dtype == torch.int8: - return to_int8(a), to_int8(b) - if dtype == torch.float8_e4m3fn: - return to_fp8(a), to_fp8(b) - - raise ValueError("unsupported dtype") - # bench def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, @@ -386,4 +362,4 @@ def to_torch_dtype(dt): model_parser.set_defaults(func=run_model_bench) args = parser.parse_args() - args.func(args) + args.func(args) \ No newline at end of file diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index 25ec9d6028627..d58fb0bf86374 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -40,4 +40,4 @@ ([8192, 57344], 1), ([28672, 8192], 0), ], -} +} \ No newline at end of file diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py new file mode 100644 index 0000000000000..ef91f9f8eb529 --- /dev/null +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -0,0 +1,173 @@ +import pickle as pkl +import time +from dataclasses import dataclass +from itertools import product +from typing import Callable, Iterable, List, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from tqdm import tqdm + +import vllm._custom_ops as ops +from vllm.model_executor.layers.layernorm import RMSNorm + + +@dataclass +class 
bench_params_t: + num_tokens: int + hidden_size: int + add_residual: bool + dtype: torch.dtype + + def description(self): + return (f'N {self.num_tokens} ' + f'x D {self.hidden_size} ' + f'x R {self.add_residual} ' + f'x DT {self.dtype}') + + +def get_bench_params() -> List[bench_params_t]: + ## Test Fixtures + NUM_TOKENS = [2**x for x in range(11)] + HIDDEN_SIZES = list(range(1024, 8129, 1024)) + ADD_RESIDUAL = [True, False] + DTYPES = [torch.bfloat16, torch.float] + + combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES) + bench_params = list(map(lambda x: \ + bench_params_t(x[0], x[1], x[2], x[3]), combinations)) + return bench_params + + +# Reference impls +def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _, _ = ops.scaled_int8_quant(torch_out) + + +def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _ = ops.scaled_fp8_quant(torch_out) + + +def fused_impl( + rms_norm_layer: RMSNorm, # this stores the weights + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype): + out, _ = ops.rms_norm_dynamic_per_token_quant(x, + rms_norm_layer.weight, + 1e-6, + quant_dtype, + residual=residual) + + +# Bench functions +def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor, + quant_dtype: torch.dtype, label: str, sub_label: str, + fn: Callable, description: str) -> TMeasurement: + + min_run_time = 1 + + globals = { + "rms_norm_layer": rms_norm_layer, + "x": x, + "residual": residual, + "quant_dtype": quant_dtype, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(rms_norm_layer, x, residual, quant_dtype)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + +def bench(params: bench_params_t, label: str, sub_label: str) \ + -> Iterable[TMeasurement]: + + # Make inputs + layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype) + # Make weights + layer.weight.data.normal_(mean=1.0, std=0.1) + # Make inputs + scale = 1 / params.hidden_size + x = torch.randn(params.num_tokens, + params.hidden_size, + dtype=params.dtype, + device='cuda') * scale + residual = (torch.randn_like(x) * scale).to(device='cuda') \ + if params.add_residual else None + + timers = [] + + # unfused int8 impl. + timers.append( + bench_fn(layer, x, residual, torch.int8, label, sub_label, + unfused_int8_impl, "unfused_int8_impl")) + + # unfused fp8 impl. + timers.append( + bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label, + unfused_fp8_impl, "unfused_fp8_impl")) + + # fused int8 impl. + timers.append( + bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl, + "fused_int8_impl")) + + # fused fp8 impl. 
+ timers.append( + bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label, + fused_impl, "fused_fp8_impl")) + + print_timers(timers) + + return timers + + +# launch bench +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def main(): + torch.set_default_device('cuda') + bench_params = get_bench_params() + + timers = [] + for bp in tqdm(bench_params): + timers.extend( + bench(bp, "rms-norm-dynamic-per-token-quant", bp.description())) + print_timers(timers) + + # pickle all the results + timestamp = int(time.time()) + with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f: + pkl.dump(timers, f) + + +if __name__ == '__main__': + main() diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py new file mode 100644 index 0000000000000..baa5de0fff1bd --- /dev/null +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -0,0 +1,262 @@ +import itertools +from typing import Optional, Tuple, Union + +import torch +import triton +from flashinfer.norm import fused_add_rmsnorm, rmsnorm +from torch import nn + +from vllm import _custom_ops as vllm_ops + + +class HuggingFaceRMSNorm(nn.Module): + + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + orig_dtype = x.dtype + x = x.to(torch.float32) + if residual is not None: + x = x + residual.to(torch.float32) + residual = x.to(orig_dtype) + + variance = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + x = x.to(orig_dtype) * self.weight + if residual is None: + return x + else: + return x, residual + + +def rmsnorm_naive( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) + naive_norm.weight = nn.Parameter(weight) + naive_norm = naive_norm.to(x.device) + + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + output = naive_norm(x, residual) + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def rmsnorm_flashinfer( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + if residual is not None: + fused_add_rmsnorm(x, residual, weight, eps) + output = (x, residual) + else: + output = rmsnorm(x, weight, eps) + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def rmsnorm_vllm( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + if residual is not None: + vllm_ops.fused_add_rms_norm(x, residual, weight, eps) + output = (x, residual) + else: + out = torch.empty_like(x) + vllm_ops.rms_norm(out, x, weight, eps) + output = out + + if 
isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): + dtype = torch.bfloat16 + x = torch.randn(batch_size, + seq_len, + hidden_size, + dtype=dtype, + device="cuda") + weight = torch.ones(hidden_size, dtype=dtype, device="cuda") + residual = torch.randn_like(x) if use_residual else None + + output_naive = rmsnorm_naive( + x.clone(), weight, + residual.clone() if residual is not None else None) + output_flashinfer = rmsnorm_flashinfer( + x.clone(), weight, + residual.clone() if residual is not None else None) + output_vllm = rmsnorm_vllm( + x.clone(), weight, + residual.clone() if residual is not None else None) + + if use_residual: + output_naive = output_naive[0] + output_flashinfer = output_flashinfer[0] + output_vllm = output_vllm[0] + + print(f"Naive output={output_naive}") + print(f"FlashInfer output={output_flashinfer}") + print(f"VLLM output={output_vllm}") + + if torch.allclose(output_naive, output_flashinfer, atol=1e-2, + rtol=1e-2) and torch.allclose( + output_naive, output_vllm, atol=1e-2, rtol=1e-2): + print("✅ All implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [2**i for i in range(0, 7, 2)] +seq_length_range = [2**i for i in range(6, 11, 1)] +head_num_range = [32, 48] +configs = list( + itertools.product(head_num_range, batch_size_range, seq_length_range)) + + +def get_benchmark(use_residual): + + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["head_num", "batch_size", "seq_len"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["huggingface", "flashinfer", "vllm"], + line_names=["HuggingFace", "FlashInfer", "vLLM"], + styles=[("blue", "-"), ("green", "-"), ("red", "-")], + ylabel="us", + plot_name= + f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual", + args={}, + )) + def benchmark(head_num, batch_size, seq_len, provider): + dtype = torch.bfloat16 + hidden_size = head_num * 128 # assuming head_dim = 128 + + x = torch.randn(batch_size, + seq_len, + hidden_size, + dtype=dtype, + device="cuda") + weight = torch.ones(hidden_size, dtype=dtype, device="cuda") + residual = torch.randn_like(x) if use_residual else None + + quantiles = [0.5, 0.2, 0.8] + + if provider == "huggingface": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_naive( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + elif provider == "flashinfer": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_flashinfer( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_vllm( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + return benchmark + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--batch-size", + type=int, + default=4, + help="Batch size", + ) + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length", + ) + parser.add_argument( + "--hidden-size", + type=int, + default=4096, + help="Hidden size (2nd dimension) of the sequence", + ) + parser.add_argument("--use-residual", + action="store_true", + 
help="Whether to use residual connection") + parser.add_argument( + "--save-path", + type=str, + default="./configs/rmsnorm/", + help="Path to save rmsnorm benchmark results", + ) + + args = parser.parse_args() + + # Run correctness test + calculate_diff(batch_size=args.batch_size, + seq_len=args.seq_len, + hidden_size=args.hidden_size, + use_residual=args.use_residual) + + # Get the benchmark function with proper use_residual setting + benchmark = get_benchmark(args.use_residual) + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp new file mode 100644 index 0000000000000..ba9f40a230c8e --- /dev/null +++ b/csrc/core/math.hpp @@ -0,0 +1,7 @@ +#include +#include + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} \ No newline at end of file diff --git a/csrc/cutlass_extensions/common.cpp b/csrc/cutlass_extensions/common.cpp new file mode 100644 index 0000000000000..3d2093ab94297 --- /dev/null +++ b/csrc/cutlass_extensions/common.cpp @@ -0,0 +1,11 @@ +#include "cutlass_extensions/common.hpp" + +int32_t get_sm_version_num() { + int32_t major_capability, minor_capability; + cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, + 0); + cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, + 0); + int32_t version_num = major_capability * 10 + minor_capability; + return version_num; +} \ No newline at end of file diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp new file mode 100644 index 0000000000000..85e359aa57113 --- /dev/null +++ b/csrc/cutlass_extensions/common.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "cutlass/cutlass.h" +#include +#include "cuda_runtime.h" +#include + +/** + * Helper function for checking CUTLASS errors + */ +#define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + TORCH_CHECK(error == cutlass::Status::kSuccess, \ + cutlassGetStatusString(error)); \ + } + +/** + * Panic wrapper for unwinding CUDA runtime errors + */ +#define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \ + } + +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, + device); + return max_shared_mem_per_block_opt_in; +} + +int32_t get_sm_version_num(); diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index 95764ecddc79f..fcc17c7727f94 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -36,13 +36,13 @@ struct ScaledEpilogueBase { // Don't want to support nullptr by default template using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; // Don't want to support nullptr by default template using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, 
EnableNullPtr>; // This utility function constructs the arguments for the load descriptors diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index a634e1c3d4886..03414b7e1ae93 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -14,6 +14,20 @@ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +// TODO(luka/varun): use FP8_TYPE macro after refactoring +#ifndef USE_ROCM + #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) +#else + #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) +#endif + +#define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__)) + #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ diff --git a/csrc/ops.h b/csrc/ops.h index ea001190bc202..c145e4eda0845 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -66,6 +66,14 @@ void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& weight, torch::Tensor& scale, double epsilon); +void rms_norm_dynamic_per_token_quant(torch::Tensor& out, + torch::Tensor const& input, + torch::Tensor const& weight, + torch::Tensor& scales, + double const epsilon, + std::optional scale_ub, + std::optional residual); + void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); @@ -154,6 +162,15 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& azp_adj, c10::optional const& azp, c10::optional const& bias); + +void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& e, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + c10::optional const& bias); + +bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, + torch::Tensor& e, torch::Tensor const& a); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp deleted file mode 100644 index bf04bb400790f..0000000000000 --- a/csrc/quantization/cutlass_w8a8/common.hpp +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include "cutlass/cutlass.h" -#include - -/** - * Helper function for checking CUTLASS errors - */ -#define CUTLASS_CHECK(status) \ - { \ - TORCH_CHECK(status == cutlass::Status::kSuccess, \ - cutlassGetStatusString(status)) \ - } - -inline uint32_t next_pow_2(uint32_t const num) { - if (num <= 1) return num; - return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); -} - -inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { - int max_shared_mem_per_block_opt_in = 0; - cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, - cudaDevAttrMaxSharedMemoryPerBlockOptin, - device); - return max_shared_mem_per_block_opt_in; -} - diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh index d03242f44ab1d..75681f7f37820 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh @@ -21,7 
+21,8 @@ #include "cutlass/epilogue/threadblock/fusion/visitors.hpp" #include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" -#include "common.hpp" +#include "core/math.hpp" +#include "cutlass_extensions/common.hpp" // clang-format on using namespace cute; diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 33581a63d4c3d..8190277997161 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -24,7 +24,8 @@ #include "cutlass/gemm/collective/collective_builder.hpp" #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" -#include "common.hpp" +#include "core/math.hpp" +#include "cutlass_extensions/common.hpp" // clang-format on using namespace cute; diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 97a969cf5e3e0..4f7b6588ef3f7 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -3,6 +3,8 @@ #include #include +#include "cutlass_extensions/common.hpp" + void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -79,16 +81,6 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { return false; } -int32_t get_sm_version_num() { - int32_t major_capability, minor_capability; - cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, - 0); - cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, - 0); - int32_t version_num = major_capability * 10 + minor_capability; - return version_num; -} - void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh index d7c0297d5333f..15bd5b6ed1564 100644 --- a/csrc/quantization/fp8/common.cuh +++ b/csrc/quantization/fp8/common.cuh @@ -1,6 +1,9 @@ #pragma once +#include "quantization/vectorization.cuh" + #include +#include #ifndef USE_ROCM #include @@ -15,6 +18,7 @@ using FP8_TYPE = c10::Float8_e4m3fnuz; // issue when running dynamic quantization. Here use 224.0f for rocm. constexpr auto FP8_E4M3_MAX = 224.0f; #endif +constexpr static auto kFp8Type = c10::CppTypeToScalarType::value; namespace vllm { @@ -89,22 +93,6 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, } } -template -struct __align__(8) vec4_t { - scalar_t x; - scalar_t y; - scalar_t z; - scalar_t w; -}; - -typedef struct __align__(4) { - FP8_TYPE x; - FP8_TYPE y; - FP8_TYPE z; - FP8_TYPE w; -} -float8x4_t; - template __device__ float thread_max_vec(scalar_t const* __restrict__ input, int64_t const num_elems, int const tid, @@ -139,10 +127,10 @@ __device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out, float const scale, int64_t const num_elems, int const tid, int const step) { + using float8x4_t = q8x4_t; // Vectorized input/output to better utilize memory bandwidth. 
- vec4_t const* vectorized_in = - reinterpret_cast const*>(input); - float8x4_t* vectorized_out = reinterpret_cast(out); + auto const* vectorized_in = reinterpret_cast const*>(input); + auto* vectorized_out = reinterpret_cast(out); int64_t const num_vec_elems = num_elems >> 2; diff --git a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu new file mode 100644 index 0000000000000..3c4f183bf4b59 --- /dev/null +++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu @@ -0,0 +1,160 @@ + +#include +#include + +#include "../../dispatch_utils.h" +#include "layernorm_utils.cuh" +#include "quant_conversions.cuh" + +namespace vllm { + +template +__device__ void rms_norm_dynamic_per_token_quant_vec( + scalar_out_t* __restrict__ out, // [..., hidden_size] + float* __restrict__ scales, // [num_tokens] + scalar_t const* __restrict__ input, // [..., hidden_size] + scalar_t const* __restrict__ weight, // [hidden_size] + float const* scale_ub, float const var_epsilon, + float const min_scaling_factor, int32_t const hidden_size, + scalar_t* __restrict__ residual = nullptr) { + float rms = 0.0f; + float token_scale = 0.0f; + + // Compute rms + vllm::vectorized::compute_rms( + &rms, input, hidden_size, var_epsilon, residual); + + // Compute scale + vllm::vectorized::compute_dynamic_per_token_scales( + &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor, + hidden_size, residual); + + // RMS Norm + Quant + if constexpr (std::is_same_v) { + vllm::vectorized::norm_and_quant( + out, input, weight, rms, 1.0f / token_scale, hidden_size, residual); + } else { + // FP8 - Do not invert token_scale for exact match with FBGemm + vllm::vectorized::norm_and_quant( + out, input, weight, rms, token_scale, hidden_size, residual); + } +} + +// RMS norm + quant kernel +template +__global__ void rms_norm_dynamic_per_token_quant_kernel( + scalar_out_t* __restrict__ out, // [..., hidden_size] + float* __restrict__ scales, // [num_tokens] + scalar_t const* __restrict__ input, // [..., hidden_size] + scalar_t const* __restrict__ weight, // [hidden_size] + float const* scale_ub, float const var_epsilon, + float const min_scaling_factor, int32_t const hidden_size, + scalar_t* __restrict__ residual = nullptr) { + // For vectorization, token_input and token_output pointers need to be + // aligned at 8-byte and 4-byte addresses respectively. 
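For readers tracing the fused kernel below, a scalar host-side restatement of what one token goes through may help: compute rms = 1/sqrt(mean(x^2) + eps), take the per-token absmax of the normalized and weighted values to derive a dynamic scale, then quantize with that scale. This sketch is not part of the patch; it shows the int8 path only, omits the residual add and the fp8-only scale_ub clamp, and takes min_scaling_factor as a parameter the way the dispatch layer computes it.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Reference for one token: returns the dynamic scale the kernel would write
// to scales[token]; out[i] = saturate(round(w[i] * (x[i] * rms) / scale)).
inline float rms_norm_dynamic_int8_quant_ref(std::vector<float> const& x,
                                             std::vector<float> const& w,
                                             std::vector<int8_t>& out,
                                             float eps,
                                             float min_scaling_factor) {
  float ss = 0.f;
  for (float v : x) ss += v * v;
  float const rms = 1.f / std::sqrt(ss / x.size() + eps);

  float absmax = 0.f;
  for (size_t i = 0; i < x.size(); ++i)
    absmax = std::max(absmax, std::fabs(x[i] * rms * w[i]));
  float const scale = std::max(absmax / 127.f, min_scaling_factor);

  for (size_t i = 0; i < x.size(); ++i) {
    float const r = std::nearbyint(x[i] * rms * w[i] / scale);
    out[i] = static_cast<int8_t>(std::clamp(r, -128.f, 127.f));
  }
  return scale;
}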
+ bool const can_vectorize = hidden_size % 4 == 0; + + if (can_vectorize) { + return rms_norm_dynamic_per_token_quant_vec( + out, scales, input, weight, scale_ub, var_epsilon, min_scaling_factor, + hidden_size, residual); + } + + float rms = 0.0f; + float token_scale = 0.0f; + + // Compute RMS + vllm::compute_rms(&rms, input, hidden_size, + var_epsilon, residual); + // Compute Scale + vllm::compute_dynamic_per_token_scales( + &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor, + hidden_size, residual); + + // RMS Norm + Quant + if constexpr (std::is_same_v) { + vllm::norm_and_quant( + out, input, weight, rms, 1.0f / token_scale, hidden_size, residual); + } else { + // FP8 - Do not invert s_token_scale for exact match with FBGemm + vllm::norm_and_quant( + out, input, weight, rms, token_scale, hidden_size, residual); + } +} +} // namespace vllm + +// Residual add + RMS norm + dynamic per token +template +void rms_norm_dynamic_per_token_quant_dispatch( + torch::Tensor& out, // [..., hidden_size] + torch::Tensor const& input, // [..., hidden_size] + torch::Tensor const& weight, // [hidden_size] + torch::Tensor& scales, // [num_tokens] + double const var_epsilon, // Variance epsilon used in norm calculation + std::optional const& scale_ub, + std::optional& residual) { + int32_t hidden_size = input.size(-1); + int32_t num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + const float min_scaling_factor = + out.dtype() == torch::kInt8 + ? std::numeric_limits::epsilon() + : 1.0f / (std::numeric_limits::max() * 512.f); + + if (residual.has_value()) { + VLLM_DISPATCH_QUANT_TYPES( + out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] { + vllm::rms_norm_dynamic_per_token_quant_kernel + <<>>( + out.data_ptr(), scales.data_ptr(), + input.data_ptr(), weight.data_ptr(), + scale_ub.has_value() ? scale_ub->data_ptr() : nullptr, + var_epsilon, min_scaling_factor, hidden_size, + residual->data_ptr()); + }); + + } else { + VLLM_DISPATCH_QUANT_TYPES( + out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] { + vllm::rms_norm_dynamic_per_token_quant_kernel + <<>>( + out.data_ptr(), scales.data_ptr(), + input.data_ptr(), weight.data_ptr(), + scale_ub.has_value() ? 
scale_ub->data_ptr() : nullptr, + var_epsilon, min_scaling_factor, hidden_size, nullptr); + }); + } +} + +void rms_norm_dynamic_per_token_quant( + torch::Tensor& out, // [..., hidden_size] + torch::Tensor const& input, // [..., hidden_size] + torch::Tensor const& weight, // [hidden_size] + torch::Tensor& scales, // [num_tokens] + double const var_epsilon, // Variance epsilon used in norm calculation + std::optional scale_ub, std::optional residual) { + TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8); + TORCH_CHECK(out.is_contiguous() && input.is_contiguous()); + + if (scale_ub.has_value()) { + TORCH_CHECK(out.dtype() == kFp8Type); + } + TORCH_CHECK(scales.dtype() == torch::kFloat32); + + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] { + rms_norm_dynamic_per_token_quant_dispatch( + out, input, weight, scales, var_epsilon, scale_ub, residual); + }); +} diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh new file mode 100644 index 0000000000000..cec6b54edb569 --- /dev/null +++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh @@ -0,0 +1,327 @@ +#pragma once + +/** + * __device__ layernorm utilities. + */ + +#include "quantization/vectorization.cuh" +#include "quant_conversions.cuh" + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +// has_residual must be true, if residual is not a nullptr +template +__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, + int32_t const hidden_size, float const epsilon, + scalar_t const* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + // sum of squares + float ss = 0.0f; + + for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) { + float x = static_cast(input[token_offset + i]); + if constexpr (has_residual) { + x += static_cast(residual[token_offset + i]); + } + + ss += x * x; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x); + + __shared__ float s_rms; + if (threadIdx.x == 0) { + s_rms = rsqrtf(ss / hidden_size + epsilon); + } + __syncthreads(); + + *rms = s_rms; +} + +template +__device__ void compute_dynamic_per_token_scales( + float* __restrict__ token_scale, float* __restrict__ all_token_scales, + scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, + float const rms, float const* __restrict__ scale_ub, + float const min_scaling_factor, int32_t const hidden_size, + scalar_t const* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + ; + constexpr scalar_out_t qmax{std::numeric_limits::max()}; + + float block_absmax_val_maybe = 0.0f; + for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) { + float x = static_cast(input[token_offset + i]); + if constexpr (has_residual) { + x += static_cast(residual[token_offset + i]); + } + + x = static_cast(static_cast(x * rms) * weight[i]); + block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x)); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + block_absmax_val_maybe = + BlockReduce(reduceStore) + .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x); + + __shared__ float s_token_scale; + if (threadIdx.x == 0) { + float scale = 0.0f; + if (scale_ub) { + scale = 
min(block_absmax_val_maybe, *scale_ub); + } else { + scale = block_absmax_val_maybe; + } + // token scale computation + scale = max(scale / qmax, min_scaling_factor); + s_token_scale = scale; // Shared memory store + all_token_scales[blockIdx.x] = scale; // Global output store + } + __syncthreads(); + + *token_scale = s_token_scale; +} + +template +__device__ void norm_and_quant(scalar_out_t* __restrict__ output, + scalar_t const* __restrict__ input, + scalar_t const* __restrict__ weight, + float const rms, float const scale, + int32_t const hidden_size, + scalar_t* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + ; + + for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) { + float x = static_cast(input[token_offset + i]); + if constexpr (has_residual) { + x += static_cast(residual[token_offset + i]); + residual[token_offset + i] = static_cast(x); + } + // Norm + x = static_cast(static_cast(x * rms) * weight[i]); + // Quant + output[token_offset + i] = + ScaledQuant::quant_fn(x, scale); + } +} + +namespace vectorized { + +// Compute 1.0/rms(input) +// hidden_size must be a multiple of 4 +template +__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, + int32_t const hidden_size, float const epsilon, + scalar_t const* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + + // Vectorized input/output to better utilize memory bandwidth. + vec4_t const* vec_input = + reinterpret_cast const*>(&input[token_offset]); + vec4_t const* vec_residual = nullptr; + if constexpr (has_residual) { + vec_residual = + reinterpret_cast const*>(&residual[token_offset]); + } + + // sum of squares + float ss = 0.0f; + + int32_t const num_vec_elems = hidden_size >> 2; + +#pragma unroll 4 + for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + vec4_t in = vec_input[i]; + + vec4_t x; + x.x = static_cast(in.x); + x.y = static_cast(in.y); + x.z = static_cast(in.z); + x.w = static_cast(in.w); + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; + x.x += static_cast(r.x); + x.y += static_cast(r.y); + x.z += static_cast(r.z); + x.w += static_cast(r.w); + } + + ss += x.x * x.x; + ss += x.y * x.y; + ss += x.z * x.z; + ss += x.w * x.w; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x); + + __shared__ float s_rms; + if (threadIdx.x == 0) { + s_rms = rsqrtf(ss / hidden_size + epsilon); + } + __syncthreads(); + + *rms = s_rms; +} + +// Vectorized version of vllm::compute_dynamic_per_token_scales +// hidden_size must be a multiple of 4 +template +__device__ void compute_dynamic_per_token_scales( + float* __restrict__ token_scale, float* __restrict__ all_token_scales, + scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, + float const rms, float const* __restrict__ scale_ub, + float const min_scaling_factor, int32_t const hidden_size, + scalar_t const* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + ; + + // Vectorized input/weight/residual to better utilize memory bandwidth. 
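The scale computation above (and its vectorized variant below) uses the same block-reduce-then-broadcast idiom as compute_rms: one CUDA block per token, a cub::BlockReduce across the block, and a shared-memory broadcast because only thread 0 receives the reduced value. A stripped-down sketch of just that idiom, with the 1024-thread bound matching the launch configuration in the dispatch code:

#include <cub/cub.cuh>

// Block-wide absmax with broadcast; valid for any blockDim.x <= kMaxThreads.
template <int kMaxThreads = 1024>
__device__ float block_absmax_broadcast(float thread_partial_absmax) {
  using BlockReduce = cub::BlockReduce<float, kMaxThreads>;
  __shared__ typename BlockReduce::TempStorage reduce_storage;
  // Only thread 0 holds the reduced result after this call.
  float const block_val =
      BlockReduce(reduce_storage)
          .Reduce(thread_partial_absmax, cub::Max{}, blockDim.x);

  __shared__ float s_val;
  if (threadIdx.x == 0) s_val = block_val;
  __syncthreads();  // every thread now reads the same block-wide maximum
  return s_val;
}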
+ vec4_t const* vec_input = + reinterpret_cast const*>(&input[token_offset]); + vec4_t const* vec_weight = + reinterpret_cast const*>(weight); + vec4_t const* vec_residual = nullptr; + if constexpr (has_residual) { + vec_residual = + reinterpret_cast const*>(&residual[token_offset]); + } + + constexpr scalar_out_t qmax{std::numeric_limits::max()}; + + int32_t const num_vec_elems = hidden_size >> 2; + float block_absmax_val_maybe = 0.0f; + +#pragma unroll 4 + for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + vec4_t in = vec_input[i]; + vec4_t const w = vec_weight[i]; + + vec4_t x; + x.x = static_cast(in.x); + x.y = static_cast(in.y); + x.z = static_cast(in.z); + x.w = static_cast(in.w); + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; + x.x += static_cast(r.x); + x.y += static_cast(r.y); + x.z += static_cast(r.z); + x.w += static_cast(r.w); + } + + block_absmax_val_maybe = fmaxf( + block_absmax_val_maybe, fabs(static_cast(x.x * rms) * w.x)); + block_absmax_val_maybe = fmaxf( + block_absmax_val_maybe, fabs(static_cast(x.y * rms) * w.y)); + block_absmax_val_maybe = fmaxf( + block_absmax_val_maybe, fabs(static_cast(x.z * rms) * w.z)); + block_absmax_val_maybe = fmaxf( + block_absmax_val_maybe, fabs(static_cast(x.w * rms) * w.w)); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + block_absmax_val_maybe = + BlockReduce(reduceStore) + .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x); + + __shared__ float s_token_scale; + if (threadIdx.x == 0) { + float scale = 0.0f; + if (scale_ub) { + scale = min(block_absmax_val_maybe, *scale_ub); + } else { + scale = block_absmax_val_maybe; + } + // token scale computation + scale = max(scale / qmax, min_scaling_factor); + s_token_scale = scale; // shared memory store + all_token_scales[blockIdx.x] = scale; // global output store + } + __syncthreads(); + + *token_scale = s_token_scale; +} + +// hidden_size must be a multiple of 4 +template +__device__ void norm_and_quant(scalar_out_t* __restrict__ output, + scalar_t const* __restrict__ input, + scalar_t const* __restrict__ weight, + float const rms, float const scale, + int32_t const hidden_size, + scalar_t* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + ; + + // Vectorized input/output/weight/residual to better utilize memory bandwidth. 
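One detail worth calling out before the vectorized norm_and_quant below: the meaning of the scale argument depends on the is_scale_inverted flag chosen by the callers above. The int8 path inverts the token scale once and multiplies per element, while the fp8 path keeps the scale as-is and divides, to match FBGemm bit-for-bit. A small sketch of the two modes in terms of ScaledQuant from quant_conversions.cuh in this patch; the explicit template arguments and the CUDA fp8 type are inferred here since the angle-bracket contents are not fully visible in the hunks above (a ROCm build would use c10::Float8_e4m3fnuz).

#include <c10/util/Float8_e4m3fn.h>
#include "quant_conversions.cuh"  // as included from the fused_kernels directory

__device__ inline void quant_both_modes_sketch(float const x,
                                               float const token_scale,
                                               int8_t& q_int8,
                                               c10::Float8_e4m3fn& q_fp8) {
  // int8 path: pass 1/token_scale, quant_fn multiplies.
  q_int8 = vllm::ScaledQuant<int8_t, /*is_scale_inverted=*/true>::quant_fn(
      x, 1.0f / token_scale);
  // fp8 path: pass token_scale unchanged, quant_fn divides (FBGemm-exact).
  q_fp8 = vllm::ScaledQuant<c10::Float8_e4m3fn,
                            /*is_scale_inverted=*/false>::quant_fn(x,
                                                                   token_scale);
}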
+ vec4_t const* vec_input = + reinterpret_cast const*>(&input[token_offset]); + vec4_t const* vec_weight = + reinterpret_cast const*>(weight); + q8x4_t* vec_output = + reinterpret_cast*>(&output[token_offset]); + vec4_t* vec_residual = nullptr; + if constexpr (has_residual) { + vec_residual = reinterpret_cast*>(&residual[token_offset]); + } + + int32_t const num_vec_elems = hidden_size >> 2; + +// TODO(luka/varun) extract into type-agnostic vectorized quant function to +// replace scaled_fp8_conversion_vec +#pragma unroll 4 + for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + vec4_t const in = vec_input[i]; + vec4_t const w = vec_weight[i]; + + vec4_t x; + x.x = static_cast(in.x); + x.y = static_cast(in.y); + x.z = static_cast(in.z); + x.w = static_cast(in.w); + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; + x.x += static_cast(r.x); + x.y += static_cast(r.y); + x.z += static_cast(r.z); + x.w += static_cast(r.w); + // Update residual + r.x = static_cast(x.x); + r.y = static_cast(x.y); + r.z = static_cast(x.z); + r.w = static_cast(x.w); + vec_residual[i] = r; + } + + q8x4_t out; + out.x = ScaledQuant::quant_fn( + static_cast(x.x * rms) * w.x, scale); + out.y = ScaledQuant::quant_fn( + static_cast(x.y * rms) * w.y, scale); + out.z = ScaledQuant::quant_fn( + static_cast(x.z * rms) * w.z, scale); + out.w = ScaledQuant::quant_fn( + static_cast(x.w * rms) * w.w, scale); + vec_output[i] = out; + } +} + +} // namespace vectorized + +} // namespace vllm diff --git a/csrc/quantization/fused_kernels/quant_conversions.cuh b/csrc/quantization/fused_kernels/quant_conversions.cuh new file mode 100644 index 0000000000000..f8a9872226a3a --- /dev/null +++ b/csrc/quantization/fused_kernels/quant_conversions.cuh @@ -0,0 +1,81 @@ +#pragma once + +/** + * __device__ helper functions to deal with float -> quant datatype conversion + */ + +#include "quantization/vectorization.cuh" +// TODO(luka/varun):refactor common.cuh to use this file instead +#include "quantization/fp8/common.cuh" + +namespace vllm { + +// TODO(luka/varun): combine into common utilities for int8 +// (with int8_quant_kernels.cu) +static __device__ __forceinline__ int8_t float_to_int8_rn(float const x) { +#ifdef USE_ROCM + static const float i8_min = + static_cast(std::numeric_limits::min()); + static const float i8_max = + static_cast(std::numeric_limits::max()); + // round + float dst = std::nearbyint(x); + // saturate + dst = std::clamp(dst, i8_min, i8_max); + return static_cast(dst); +#else + // CUDA path + uint32_t dst; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x)); + return reinterpret_cast(dst); +#endif +} + +static __device__ __forceinline__ FP8_TYPE float_to_fp8(float const x) { + float const r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); + return static_cast(r); +} + +template +struct ScaledQuant; + +template +struct ScaledQuant< + quant_type_t, is_scale_inverted, + typename std::enable_if_t>> { + static __device__ __forceinline__ quant_type_t quant_fn(float const x, + float const scale) { + if constexpr (is_scale_inverted) { + return float_to_int8_rn(x * scale); + } else { + return float_to_int8_rn(x / scale); + } + } +}; + +template +struct ScaledQuant< + quant_type_t, is_scale_inverted, + typename std::enable_if_t>> { + static __device__ __forceinline__ quant_type_t quant_fn(float const x, + float const scale) { + if constexpr (is_scale_inverted) { + return float_to_fp8(x * scale); + } else { + return float_to_fp8(x / scale); + } + } +}; + +template +__device__ void 
scaled_quant_conversion(quant_type_t* __restrict__ output, + scalar_t const* __restrict__ input, + float const scale, int const tid, + int const num_elements, + int const step) { + for (int i = tid; i < num_elements; i += step) { + output[i] = ScaledQuant(input[i], scale); + } +} + +} // namespace vllm diff --git a/csrc/quantization/vectorization.cuh b/csrc/quantization/vectorization.cuh new file mode 100644 index 0000000000000..44c999130f756 --- /dev/null +++ b/csrc/quantization/vectorization.cuh @@ -0,0 +1,33 @@ +#pragma once +/** + * __device__ datatypes vectorized by 4 + */ + +// Include both AMD and NVIDIA fp8 types to avoid circular import +// TODO(luka/varun) use FP8_TYPE instead after refactoring +#include +#include + +namespace vllm { + +// Vectorization containers +template +struct __align__(8) vec4_t { + scalar_t x; + scalar_t y; + scalar_t z; + scalar_t w; +}; + +template +struct __align__(4) q8x4_t { + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v); + quant_type_t x; + quant_type_t y; + quant_type_t z; + quant_type_t w; +}; + +} // namespace vllm diff --git a/csrc/sparse/cutlass/sparse_compressor_c3x.cu b/csrc/sparse/cutlass/sparse_compressor_c3x.cu new file mode 100644 index 0000000000000..218c5317b4de6 --- /dev/null +++ b/csrc/sparse/cutlass/sparse_compressor_c3x.cu @@ -0,0 +1,163 @@ +// clang-format will break include orders +// clang-format off +#include + +#include "sparse_scaled_mm_c3x.cuh" + +#include "cutlass/numeric_conversion.h" +#include "cutlass/transform/device/transform_universal_adapter.hpp" +#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +// clang-format on + +using namespace cute; +using namespace vllm; + +/// Make A structured sparse by replacing elements with 0 and compress it +template +bool cutlass_sparse_compress(torch::Tensor& a_nzs, torch::Tensor& a_meta, + torch::Tensor const& a) { + // Checks for conformality + TORCH_CHECK(a.dtype() == torch::kInt8 || a.dtype() == torch::kFloat8_e4m3fn || + a.dtype() == torch::kFloat16 || a.dtype() == torch::kBFloat16); + TORCH_CHECK(a.dim() == 2) + // Check for strides and alignment + TORCH_CHECK(a.stride(0) % 4 == 0) // Required for semi-structured sparsity + TORCH_CHECK(a.stride(1) == 1) + + int m = a.size(0); + int k = a.size(1); + + // Sparse kernel setup; this kernel is not used for matmul, + // but just for setting up the compressor utility + // A matrix configuration + using ElementA = ElementA_; + using LayoutTagA = cutlass::layout::RowMajor; + constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + // B matrix configuration + using ElementB = ElementA; + using LayoutTagB = cutlass::layout::ColumnMajor; + constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + // C/D matrix configuration + using ElementC = float; + using LayoutTagC = cutlass::layout::ColumnMajor; + constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + // Core kernel configurations + using ElementAccumulator = ElementAcc_; + using TileShape = Shape<_128, _128, _128>; + using TileShapeRef = Shape<_128, _128, _64>; + using ClusterShape = Shape<_1, _2, _1>; + using KernelSchedule = typename std::conditional< + std::is_same_v, + cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum, + cutlass::gemm::KernelTmaWarpSpecialized>::type; + + using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized; + using ProblemShape = Shape; + + using 
CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, ElementC, LayoutTagC, + AlignmentC, ElementC, LayoutTagC, AlignmentC, + EpilogueSchedule>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, ElementA, + LayoutTagA, AlignmentA, ElementB, LayoutTagB, AlignmentB, + ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using GemmKernel = + cutlass::gemm::kernel::GemmUniversal; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + using StrideA = cutlass::gemm::TagToStrideA_t; + using StrideE = StrideA; + + using StrideA = Stride, int64_t>; + + // The n (=1) dimension does not matter for the compressor + typename GemmKernel::ProblemShape prob_shape{m, 1, k, 1}; + + using LayoutA = typename GemmKernel::CollectiveMainloop::LayoutA; + using LayoutE = typename GemmKernel::CollectiveMainloop::LayoutE; + + using ElementE = typename GemmKernel::CollectiveMainloop::ElementE; + using SparseConfig = typename GemmKernel::CollectiveMainloop::SparseConfig; + + // Offline compressor kernel + using CompressorUtility = + cutlass::transform::kernel::StructuredSparseCompressorUtility< + ProblemShape, ElementA, LayoutTagA, SparseConfig>; + + using CompressorKernel = + cutlass::transform::kernel::StructuredSparseCompressor< + ProblemShape, ElementA, LayoutTagA, SparseConfig, + cutlass::arch::Sm90>; + + using Compressor = + cutlass::transform::device::TransformUniversalAdapter; + + auto [M, N, K, L] = prob_shape; + + StrideA stride_A; + stride_A = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + + CompressorUtility compressor_utility(prob_shape, stride_A); + + int ME = compressor_utility.get_metadata_m_physical(); + int KE = compressor_utility.get_metadata_k_physical(); + int KC = compressor_utility.get_tensorA_k_physical(); + + auto a_ptr = static_cast(a.data_ptr()); + + auto a_nzs_ptr = static_cast(a_nzs.data_ptr()); + auto a_meta_ptr = static_cast( + a_meta.data_ptr()); + + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = 0; + hw_info.sm_count = + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + hw_info.device_id); + typename Compressor::Arguments arguments{ + prob_shape, {a_ptr, stride_A, a_nzs_ptr, a_meta_ptr}, {hw_info}}; + + Compressor compressor_op; + size_t workspace_size = Compressor::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(compressor_op.can_implement(arguments)); + CUTLASS_CHECK(compressor_op.initialize(arguments, workspace.get())); + CUTLASS_CHECK(compressor_op.run()); + CUDA_CHECK(cudaDeviceSynchronize()); + + return true; +} + +bool cutlass_sparse_compress_sm90(torch::Tensor& a_nzs, torch::Tensor& a_meta, + torch::Tensor const& a) { + if (a.dtype() == torch::kBFloat16) { + return cutlass_sparse_compress(a_nzs, a_meta, + a); + } else if (a.dtype() == torch::kFloat16) { + return cutlass_sparse_compress(a_nzs, a_meta, a); + } else if (a.dtype() == torch::kFloat8_e4m3fn) { + return cutlass_sparse_compress(a_nzs, a_meta, + a); + } else if (a.dtype() == torch::kInt8) { + return cutlass_sparse_compress(a_nzs, a_meta, a); + } 
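For orientation, the buffer shapes this compressor produces follow from the 2:4 structured-sparsity checks in the entry point below: for a row-major m x k input, the non-zero tensor keeps k/2 columns and the metadata keeps k/8 columns (one metadata byte holds four 2-bit indices). A hedged host-side sketch of allocating those buffers and calling the entry point; the wrapper name is made up, and the metadata dtype (uint8, matching the checks in sparse_scaled_mm_c3x.cu) and the divisibility of k by 8 are assumptions.

#include <utility>
#include <torch/all.h>

// Declared in csrc/ops.h in this patch.
bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
                                   torch::Tensor& e, torch::Tensor const& a);

// Hypothetical helper: compress a row-major CUDA tensor a of shape [m, k]
// (k assumed divisible by 8) into non-zeros plus 2:4 sparsity metadata.
std::pair<torch::Tensor, torch::Tensor> compress_2to4_sketch(
    torch::Tensor const& a) {
  int64_t const m = a.size(0);
  int64_t const k = a.size(1);
  auto a_nzs = torch::empty({m, k / 2}, a.options());  // same dtype as a
  auto a_meta = torch::empty({m, k / 8}, a.options().dtype(torch::kUInt8));
  TORCH_CHECK(cutlass_sparse_compress_entry(a_nzs, a_meta, a),
              "2:4 compression requires SM90 and a supported dtype");
  return {a_nzs, a_meta};  // consumed later by cutlass_scaled_sparse_mm
}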
+ return false; +} \ No newline at end of file diff --git a/csrc/sparse/cutlass/sparse_compressor_entry.cu b/csrc/sparse/cutlass/sparse_compressor_entry.cu new file mode 100644 index 0000000000000..d23d937b6ac28 --- /dev/null +++ b/csrc/sparse/cutlass/sparse_compressor_entry.cu @@ -0,0 +1,42 @@ +#include + +#include +#include + +#include "cutlass_extensions/common.hpp" + +#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X +bool cutlass_sparse_compress_sm90(torch::Tensor& a_nzs, torch::Tensor& a_meta, + torch::Tensor const& a); +#endif + +bool cutlass_sparse_compress_entry(torch::Tensor& a_nzs, torch::Tensor& a_meta, + torch::Tensor const& a) { + // Checks for conformality + TORCH_CHECK(a.dim() == 2 && a_meta.dim() == 2 && a_nzs.dim() == 2); + TORCH_CHECK(a.size(0) == a_nzs.size(0) && a.size(0) == a_meta.size(0) && + a_nzs.size(1) * 2 == a.size(1) && + a_meta.size(1) * 2 * 4 == a.size(1)); + // Considering elemsPerMetaElem = 8b / 2b_per_nz = 4 + + // Check for strides and alignment + TORCH_CHECK(a.stride(1) == 1 && a_nzs.stride(1) == 1 && + a_meta.stride(1) == 1); // Row-major + TORCH_CHECK(a.stride(0) % 8 == 0); // 8 Byte Alignment for Compression + + at::cuda::OptionalCUDAGuard const device_guard(device_of(a)); + int32_t version_num = get_sm_version_num(); + + // Guard against compilation issues for sm90 kernels +#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X + if (version_num >= 90) { + return cutlass_sparse_compress_sm90(a_nzs, a_meta, a); + } +#endif + + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled cutlass_scaled_sparse_mm for a compute capability less than " + "CUDA device capability: ", + version_num); +} diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu new file mode 100644 index 0000000000000..b50e9a3a2c240 --- /dev/null +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -0,0 +1,303 @@ +// clang-format will break include orders +// clang-format off +#include + +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 +#include "sparse_scaled_mm_c3x.cuh" +// clang-format on + +using namespace cute; +using namespace vllm; + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& bt_nzs, + torch::Tensor const& bt_meta, + EpilogueArgs&&... 
args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(bt_meta.dtype() == torch::kUInt8); + TORCH_CHECK(bt_nzs.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmDefault = + typename sm90_config_default::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_fp8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_fp8_config_M128::Cutlass3xGemm; + using Cutlass3xGemmM256 = + typename sm90_fp8_config_M256::Cutlass3xGemm; + using Cutlass3xGemmM512 = + typename sm90_fp8_config_M512::Cutlass3xGemm; + + using Cutlass3xGemm1 = + typename sm90_fp8_config_1::Cutlass3xGemm; + using Cutlass3xGemm2 = + typename sm90_fp8_config_2::Cutlass3xGemm; + using Cutlass3xGemm3 = + typename sm90_fp8_config_3::Cutlass3xGemm; + using Cutlass3xGemm4 = + typename sm90_fp8_config_4::Cutlass3xGemm; + using Cutlass3xGemm5 = + typename sm90_fp8_config_5::Cutlass3xGemm; + using Cutlass3xGemm6 = + typename sm90_fp8_config_6::Cutlass3xGemm; + using Cutlass3xGemm7 = + typename sm90_fp8_config_7::Cutlass3xGemm; + using Cutlass3xGemm8 = + typename sm90_fp8_config_8::Cutlass3xGemm; + + uint32_t const n = bt_nzs.size(0); + uint32_t const m = a.size(0); // Batch size + uint32_t const mp2 = + std::max(static_cast(64), next_pow_2(m)); // next power of 2 + + if (mp2 <= 64) { + if (n == 28672) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else if (n == 4096 || n == 6144) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } + } else if (mp2 <= 128) { + if (n == 4096) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else if (n == 28672) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else if (n == 6144) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } + } else if (mp2 <= 256) { + if (n == 4096) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else if (n == 28672) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else if (n == 6144) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } + } else { + if (n == 6144 || n == 28672) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else if (n == 4096) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } + } + + // Otherwise the default heuristic + if (mp2 <= 64) { + // n in [1, 64] + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else if (mp2 <= 128) { + // n in (64, 128] + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else if (mp2 <= 256) { + // n in (128, 256] + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else { + // n in (256, inf) + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } +} + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_fp16_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& bt_nzs, + torch::Tensor const& bt_meta, + EpilogueArgs&&... 
args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat16); + TORCH_CHECK(bt_meta.dtype() == torch::kUInt8); + TORCH_CHECK(bt_nzs.dtype() == torch::kFloat16); + + using Cutlass3xGemmDefault = + typename sm90_config_default::Cutlass3xGemm; + + // m in (128, inf) + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); +} + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_bf16_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& bt_nzs, + torch::Tensor const& bt_meta, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kBFloat16); + TORCH_CHECK(bt_meta.dtype() == torch::kUInt8); + TORCH_CHECK(bt_nzs.dtype() == torch::kBFloat16); + + using Cutlass3xGemmDefault = + typename sm90_config_default::Cutlass3xGemm; + + // m in (128, inf) + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); +} + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& bt_nzs, + torch::Tensor const& bt_meta, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(bt_meta.dtype() == torch::kUInt8); + TORCH_CHECK(bt_nzs.dtype() == torch::kInt8); + + using Cutlass3xGemmDefault = + typename sm90_config_default::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_int8_config_M128::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_int8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM32NBig = + typename sm90_int8_config_M32_NBig::Cutlass3xGemm; + using Cutlass3xGemmM32NSmall = + typename sm90_int8_config_M32_NSmall::Cutlass3xGemm; + + uint32_t const n = out.size(1); + bool const is_small_n = n < 8192; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(32), next_pow_2(m)); // next power of 2 + + if (mp2 <= 32) { + // m in [1, 32] + if (is_small_n) { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else { + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } + } else if (mp2 <= 64) { + // m in (32, 64] + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } else { + // m in (128, inf) + return cutlass_sparse_gemm_caller( + out, a, bt_nzs, bt_meta, std::forward(args)...); + } +} + +template