From 91642db952458fbb6ae7c2d167757dc86b105991 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 11 Dec 2024 10:43:05 -0800 Subject: [PATCH 01/85] [torch.compile] use depyf to dump torch.compile internals (#10972) Signed-off-by: youkaichao --- requirements-common.txt | 1 + vllm/compilation/backends.py | 69 ++++++++++++++++++---------------- vllm/compilation/decorators.py | 2 +- vllm/compilation/monitor.py | 23 ++++++++++-- vllm/compilation/wrapper.py | 4 +- vllm/config.py | 6 ++- vllm/worker/model_runner.py | 3 +- 7 files changed, 66 insertions(+), 42 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 792cd58e80669..850b8f4101701 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -33,3 +33,4 @@ six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that need setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. compressed-tensors == 0.8.0 # required for compressed-tensors +depyf==0.18.0 # required for profiling and debugging torch.compile diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index f002a8ff905b1..09a3daa731829 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -9,7 +9,7 @@ import torch.fx as fx import vllm.envs as envs -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, VllmConfig from vllm.logger import init_logger from vllm.utils import weak_ref_tensors @@ -149,14 +149,15 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): """ def __init__(self, module: torch.fx.GraphModule, - compile_submod_names: List[str], - compilation_configs: CompilationConfig, graph_pool): + compile_submod_names: List[str], vllm_config: VllmConfig, + graph_pool): super().__init__(module) from torch._guards import detect_fake_mode self.fake_mode = detect_fake_mode() self.compile_submod_names = compile_submod_names - self.compilation_configs = compilation_configs + self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool + self.vllm_config = vllm_config def run(self, *args): fake_args = [ @@ -182,15 +183,15 @@ def call_module(self, target: torch.fx.node.Target, compiled_graph_for_general_shape = wrap_inductor( submod, args, - self.compilation_configs.inductor_compile_config, - self.compilation_configs, + self.compilation_config.inductor_compile_config, + self.compilation_config, graph_index=index, num_graphs=len(self.compile_submod_names), runtime_shape=None, - use_inductor=self.compilation_configs.use_inductor) + use_inductor=self.compilation_config.use_inductor) self.module.__dict__[target] = PiecewiseBackend( - submod, self.compilation_configs, self.graph_pool, index, + submod, self.vllm_config, self.graph_pool, index, len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_general_shape) @@ -211,7 +212,8 @@ class VllmBackend: which handles the post-grad passes. 
""" - compilation_configs: CompilationConfig + vllm_config: VllmConfig + compilation_config: CompilationConfig graph_pool: Any _called: bool = False # the graph we compiled @@ -227,7 +229,7 @@ class VllmBackend: def __init__( self, - compilation_configs: CompilationConfig, + vllm_config: VllmConfig, ): global global_graph_pool if global_graph_pool is None: @@ -244,13 +246,14 @@ def __init__( self.sym_tensor_indices = [] self.input_buffers = [] - self.compilation_configs = compilation_configs + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config # `torch.compile` is JIT compiled, so we don't need to # do anything here def configure_post_pass(self): - config = self.compilation_configs + config = self.compilation_config self.post_grad_pass_manager.configure(config.pass_config) # Post-grad custom passes are run using the post_grad_custom_post_pass @@ -271,7 +274,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: from .monitor import torch_compile_start_time dynamo_time = time.time() - torch_compile_start_time logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time) - self.compilation_configs.compilation_time += dynamo_time + self.compilation_config.compilation_time += dynamo_time # we control the compilation process, each instance can only be # called once @@ -281,7 +284,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self.configure_post_pass() self.split_gm, self.piecewise_graphs = split_graph( - graph, self.compilation_configs.splitting_ops) + graph, self.compilation_config.splitting_ops) from torch._dynamo.utils import lazy_format_graph_code logger.debug("%s", lazy_format_graph_code("before split", self.graph)) @@ -298,13 +301,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # propagate the split graph to the piecewise backend, # compile submodules with symbolic shapes PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, - self.compilation_configs, + self.vllm_config, self.graph_pool).run(*example_inputs) self._called = True - if not self.compilation_configs.use_cudagraph or \ - not self.compilation_configs.cudagraph_copy_inputs: + if not self.compilation_config.use_cudagraph or \ + not self.compilation_config.cudagraph_copy_inputs: return self.split_gm # if we need to copy input buffers for cudagraph @@ -364,10 +367,9 @@ class ConcreteSizeEntry: class PiecewiseBackend: - def __init__(self, graph: fx.GraphModule, - compilation_configs: CompilationConfig, graph_pool: Any, - piecewise_compile_index: int, total_piecewise_compiles: int, - sym_shape_indices: List[int], + def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, + graph_pool: Any, piecewise_compile_index: int, + total_piecewise_compiles: int, sym_shape_indices: List[int], compiled_graph_for_general_shape: Callable): """ The backend for piecewise compilation. @@ -375,7 +377,7 @@ def __init__(self, graph: fx.GraphModule, We will compile `self.graph` once for the general shape, and then compile for different shapes specified in - `compilation_configs.compile_sizes`. + `compilation_config.compile_sizes`. Independently, we will capture cudagraph for different shapes. @@ -383,7 +385,8 @@ def __init__(self, graph: fx.GraphModule, compile it first, and then capture cudagraph. 
""" self.graph = graph - self.compilation_configs = compilation_configs + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool self.piecewise_compile_index = piecewise_compile_index self.total_piecewise_compiles = total_piecewise_compiles @@ -393,10 +396,10 @@ def __init__(self, graph: fx.GraphModule, piecewise_compile_index == total_piecewise_compiles - 1) self.compile_sizes: Set[int] = set( - self.compilation_configs.compile_sizes) + self.compilation_config.compile_sizes) self.capture_sizes: Set[int] = set( - self.compilation_configs.capture_sizes - ) if self.compilation_configs.use_cudagraph else set() + self.compilation_config.capture_sizes + ) if self.compilation_config.use_cudagraph else set() self.first_run_finished = False @@ -423,7 +426,7 @@ def __call__(self, *args) -> Any: self.first_run_finished = True # no specific sizes to compile if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.compilation_configs) + end_monitoring_torch_compile(self.vllm_config) return self.compiled_graph_for_general_shape(*args) runtime_shape = args[self.sym_shape_indices[0]] @@ -443,28 +446,28 @@ def __call__(self, *args) -> Any: entry.runnable = wrap_inductor( self.graph, args, - self.compilation_configs.inductor_compile_config, - self.compilation_configs, + self.compilation_config.inductor_compile_config, + self.compilation_config, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, runtime_shape=runtime_shape, - use_inductor=self.compilation_configs.use_inductor) + use_inductor=self.compilation_config.use_inductor) # finished compilations for all required shapes if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.compilation_configs) + end_monitoring_torch_compile(self.vllm_config) if not entry.use_cudagraph: return entry.runnable(*args) if entry.cudagraph is None: - if entry.num_finished_warmup < self.compilation_configs.cudagraph_num_of_warmups: # noqa + if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups: # noqa entry.num_finished_warmup += 1 if self.is_first_graph: logger.debug( "Warming up %s/%s for shape %s", entry.num_finished_warmup, - self.compilation_configs.cudagraph_num_of_warmups, + self.compilation_config.cudagraph_num_of_warmups, runtime_shape) return entry.runnable(*args) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 938430fe2a501..805a217ee6ca1 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -185,7 +185,7 @@ def __call__(self, *args, **kwargs): "Unsupported dynamic dimensions" f" {dims} for argument {k} with type {type(arg)}.") # here, it is the starting point of the `torch.compile` process - start_monitoring_torch_compile(self.vllm_config.compilation_config) + start_monitoring_torch_compile(self.vllm_config) # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 3348674b09af2..b97e40415b41b 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,19 +1,36 @@ +import os import time -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) +context_manager = None torch_compile_start_time: float = 
0.0 -def start_monitoring_torch_compile(compilation_config: CompilationConfig): +def start_monitoring_torch_compile(vllm_config: VllmConfig): global torch_compile_start_time torch_compile_start_time = time.time() + compilation_config: CompilationConfig = vllm_config.compilation_config + if compilation_config.level == CompilationLevel.PIECEWISE and \ + compilation_config.debug_dump_path: + import depyf + path = os.path.join(compilation_config.debug_dump_path, + f"rank_{vllm_config.parallel_config.rank}") + global context_manager + context_manager = depyf.prepare_debug(path) + context_manager.__enter__() -def end_monitoring_torch_compile(compilation_config: CompilationConfig): + +def end_monitoring_torch_compile(vllm_config: VllmConfig): + compilation_config: CompilationConfig = vllm_config.compilation_config if compilation_config.level == CompilationLevel.PIECEWISE: logger.info("torch.compile takes %.2f s in total", compilation_config.compilation_time) + global context_manager + if context_manager is not None: + context_manager.__exit__(None, None, None) + context_manager = None diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index bc4d292fef402..c10241b483169 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -32,8 +32,8 @@ def __init__(self, # default compilation settings # compiling the forward method - backend = get_current_vllm_config( - ).compilation_config.init_backend() + vllm_config = get_current_vllm_config() + backend = vllm_config.compilation_config.init_backend(vllm_config) compiled_callable = torch.compile( self.forward, diff --git a/vllm/config.py b/vllm/config.py index 322c8f8990a40..7f9be5a3a98bc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2222,6 +2222,7 @@ class CompilationConfig(BaseModel): - 1: dynamo as is. - 2: dynamo once. - 3: piecewise compilation. + - debug_dump_path: the path to dump the debug information. - backend: the backend for compilation. It needs to be a string. - "" (empty string): use the default backend. - "eager"/"openxla"/...: use the specified backend registered in PyTorch. @@ -2289,6 +2290,7 @@ class CompilationConfig(BaseModel): certain small batchsizes, where inductor is good at optimizing. 
""" # noqa level: int = 0 + debug_dump_path: str = "" backend: str = "" custom_ops: List[str] = Field(default_factory=list) splitting_ops: List[str] = Field(default_factory=lambda: [ @@ -2394,7 +2396,7 @@ def model_post_init(self, __context: Any) -> None: self.static_forward_context = {} self.compilation_time = 0.0 - def init_backend(self) -> Union[str, Callable]: + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -2413,7 +2415,7 @@ def init_backend(self) -> Union[str, Callable]: # merge with the config use_inductor assert self.level == CompilationLevel.PIECEWISE from vllm.compilation.backends import VllmBackend - return VllmBackend(self) + return VllmBackend(vllm_config) def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]): """To complete the initialization of config, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 551b84435fdc0..26fd486130ce6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1162,7 +1162,8 @@ def load_model(self) -> None: if self.vllm_config.compilation_config.level ==\ CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): - backend = self.vllm_config.compilation_config.init_backend() + backend = self.vllm_config.compilation_config.init_backend( + self.vllm_config) self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, From d643c2aba1cd5421200f3a3bad1813dd067233b4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 11 Dec 2024 10:49:23 -0800 Subject: [PATCH 02/85] [V1] Use input_ids as input for text-only models (#11032) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8d9976ded7c5e..e75be21ef2d91 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -61,6 +61,7 @@ def __init__( self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] + self.is_multimodal_model = model_config.is_multimodal_model self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len @@ -103,6 +104,11 @@ def __init__( # The batch sizes in the config are in descending order. self.cudagraph_batch_sizes = list( reversed(self.vllm_config.compilation_config.capture_sizes)) + + # Persistent buffers for CUDA graphs. + self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=self.device) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) @@ -310,7 +316,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - input_ids = input_ids.to(self.device, non_blocking=True) + self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, + non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) query_start_loc = query_start_loc.to(self.device, non_blocking=True) @@ -331,7 +338,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. 
logits_indices = query_start_loc[1:] - 1 - return input_ids, attn_metadata, logits_indices + return attn_metadata, logits_indices def _prepare_sampling( self, @@ -427,13 +434,15 @@ def execute_model( ) -> ModelRunnerOutput: self._update_states(scheduler_output) - # Run the encoder. - self._execute_encoder(scheduler_output) - encoder_outputs = self._gather_encoder_outputs(scheduler_output) + if self.is_multimodal_model: + # Run the multimodal encoder if any. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + else: + encoder_outputs = [] # Prepare the decoder inputs. - input_ids, attn_metadata, logits_indices = self._prepare_inputs( - scheduler_output) + attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -444,29 +453,39 @@ def execute_model( else: # Eager mode. num_input_tokens = num_scheduled_tokens - attn_metadata.num_input_tokens = num_input_tokens - # Get the inputs embeds. - if encoder_outputs: - inputs_embeds = self.model.get_input_embeddings( - input_ids, encoder_outputs) + if self.is_multimodal_model: + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + input_ids = self.input_ids[:num_scheduled_tokens] + if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + inputs_embeds = self.inputs_embeds[:num_input_tokens] + input_ids = None else: - inputs_embeds = self.model.get_input_embeddings(input_ids) - # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings), - # always use embeddings (rather than token ids) as input to the model. - # TODO(woosuk): Avoid the copy. Optimize. - self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + # For text-only models, we use token ids as input. + # While it is possible to use embeddings as input just like the + # multimodal models, it is not desirable for performance since + # then the embedding layer is not included in the CUDA graph. + input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None # Run the decoder. # Use persistent buffers for CUDA graphs. 
with set_forward_context(attn_metadata, self.vllm_config): hidden_states = self.model( - input_ids=None, + input_ids=input_ids, positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, attn_metadata=None, - inputs_embeds=self.inputs_embeds[:num_input_tokens], + inputs_embeds=inputs_embeds, ) hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] @@ -534,13 +553,20 @@ def _dummy_run( num_tokens: int, kv_caches: List[torch.Tensor], ) -> torch.Tensor: + if self.is_multimodal_model: + input_ids = None + inputs_embeds = self.inputs_embeds[:num_tokens] + else: + input_ids = self.input_ids[:num_tokens] + inputs_embeds = None with set_forward_context(None, self.vllm_config): hidden_states = model( - input_ids=None, + input_ids=input_ids, positions=self.positions[:num_tokens], kv_caches=kv_caches, attn_metadata=None, - inputs_embeds=self.inputs_embeds[:num_tokens]) + inputs_embeds=inputs_embeds, + ) return hidden_states def profile_run(self) -> None: From 66aaa7722df3d7ef9e9bd2942cab5cd0d7473174 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 11 Dec 2024 10:59:50 -0800 Subject: [PATCH 03/85] [torch.compile] remove graph logging in ci (#11110) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 09a3daa731829..4a5dc337d01b8 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -287,9 +287,11 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: graph, self.compilation_config.splitting_ops) from torch._dynamo.utils import lazy_format_graph_code - logger.debug("%s", lazy_format_graph_code("before split", self.graph)) - logger.debug("%s", lazy_format_graph_code("after split", - self.split_gm)) + + # depyf will hook lazy_format_graph_code and dump the graph + # for debugging, no need to print the graph here + lazy_format_graph_code("before split", self.graph) + lazy_format_graph_code("after split", self.split_gm) compilation_counter.num_piecewise_graphs_seen += len( self.piecewise_graphs) From 72ff3a968682e6a3f7620ab59f2baf5e8eb2777b Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:36:35 -0800 Subject: [PATCH 04/85] [core] Bump ray to use _overlap_gpu_communication in compiled graph tests (#10410) Signed-off-by: Rui Qiao Signed-off-by: Rui Qiao Co-authored-by: Rui Qiao --- requirements-test.in | 2 +- requirements-test.txt | 2 +- vllm/envs.py | 8 ++++++++ vllm/executor/ray_gpu_executor.py | 17 ++++++++++------- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/requirements-test.in b/requirements-test.in index c0b228148ab31..57fddb416317e 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -13,7 +13,7 @@ einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests peft -ray[adag]==2.35 +ray[adag]==2.40.0 sentence-transformers # required for embedding tests soundfile # required for audio tests timm # required for internvl test diff --git a/requirements-test.txt b/requirements-test.txt index 8ceb705cdffd7..c786a1249bddb 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -410,7 +410,7 @@ pyyaml==6.0.2 # ray # timm # transformers -ray[adag]==2.35.0 +ray[adag]==2.40.0 # via -r requirements-test.in redis==5.2.0 # via tensorizer diff --git a/vllm/envs.py b/vllm/envs.py index be5d9985b63a4..bc8c1499e9534 100644 --- a/vllm/envs.py +++ 
b/vllm/envs.py @@ -45,6 +45,7 @@ VLLM_USE_RAY_SPMD_WORKER: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True + VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = True VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 @@ -337,6 +338,13 @@ def get_default_config_root(): lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1")) ), + # If the env var is set, it enables GPU communication overlap in + # Ray's compiled DAG. This flag is ignored if + # VLLM_USE_RAY_COMPILED_DAG is not set. + "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": + lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "1")) + ), + # Use dedicated multiprocess context for workers. # Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 4263fb27265f6..4bf5cbbd18ffe 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -414,12 +414,10 @@ def _check_ray_adag_installation(self): import pkg_resources from packaging import version - required_version = version.parse("2.35") + required_version = version.parse("2.40") current_version = version.parse( pkg_resources.get_distribution("ray").version) - # TODO: update the constraint once we adapt to the backward - # incompatible API change from ray 2.36 - if current_version != required_version: + if current_version < required_version: raise ValueError(f"Ray version {required_version} is " f"required, but found {current_version}") @@ -445,6 +443,8 @@ def _compiled_ray_dag(self, enable_asyncio: bool): logger.info("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = %s", envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL) + logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s", + envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) with InputNode() as input_data: # Example DAG: PP=2, TP=4 # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501 @@ -480,7 +480,10 @@ def _compiled_ray_dag(self, enable_asyncio: bool): forward_dag = MultiOutputNode(outputs) - return forward_dag.experimental_compile(enable_asyncio=enable_asyncio) + return forward_dag.experimental_compile( + enable_asyncio=enable_asyncio, + _overlap_gpu_communication=envs. 
+ VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) def __del__(self): self.shutdown() @@ -507,8 +510,8 @@ async def execute_model_async( serialized_data = self.input_encoder.encode(execute_model_req) dag_future = await self.forward_dag.execute_async(serialized_data) - outputs = await dag_future - return self.output_decoder.decode(outputs[0]) + output = await dag_future[0] + return self.output_decoder.decode(output) async def _driver_execute_model_async( self, From d1e21a979bba4712f48dac1bbf410e0b57c92e7a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 12 Dec 2024 06:18:16 +0800 Subject: [PATCH 05/85] [CI/Build] Split up VLM tests (#11083) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 32 ++++++--- pyproject.toml | 3 +- .../vision_language/test_models.py | 72 ++++++++++++------- tests/utils.py | 37 ++++++---- 4 files changed, 94 insertions(+), 50 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index df4fa7a6ee9ba..aca505178df06 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -321,7 +321,7 @@ steps: ##### models test ##### -- label: Basic Models Test # 30min +- label: Basic Models Test # 24min source_file_dependencies: - vllm/ - tests/models @@ -331,7 +331,7 @@ steps: - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py -- label: Language Models Test (Standard) # 42min +- label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -342,7 +342,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model -- label: Language Models Test (Extended) # 50min +- label: Language Models Test (Extended) # 1h10min optional: true source_file_dependencies: - vllm/ @@ -353,7 +353,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 26min +- label: Multi-Modal Models Test (Standard) # 28min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -369,7 +369,7 @@ steps: - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Multi-Modal Models Test (Extended) # 1h15m +- label: Multi-Modal Models Test (Extended) 1 # 1h16m optional: true source_file_dependencies: - vllm/ @@ -380,14 +380,24 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not 
core_model' +- label: Multi-Modal Models Test (Extended) 2 # 38m + optional: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/vision_language + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' + # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test optional: true @@ -446,11 +456,11 @@ steps: - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus - - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus + - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py @@ -540,7 +550,7 @@ steps: # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional diff --git a/pyproject.toml b/pyproject.toml index 253b706a774a7..c5a14ecf5aea9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,7 +96,8 @@ markers = [ "core_model: enable this model test in each PR instead of only nightly", "cpu_model: enable this model test in CPU tests", "quant_model: run this model test under Quantized category", - "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", + "split: run this test as part of a split", + "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", ] diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index ed8f34a677f84..3101d1d2ea831 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -1,7 +1,9 @@ """Common tests for testing .generate() functionality for single / multiple image, embedding, and video support for different VLMs in vLLM. 
""" +import math import os +from collections import defaultdict from pathlib import PosixPath from typing import Type @@ -10,11 +12,12 @@ from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform -from vllm.utils import cuda_device_count_stateless, identity +from vllm.utils import identity from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets, _VideoAssets) -from ....utils import fork_new_process_for_each_test, large_gpu_mark +from ....utils import (fork_new_process_for_each_test, large_gpu_mark, + multi_gpu_marks) from ...utils import check_outputs_equal from .vlm_utils import custom_inputs, model_utils, runners from .vlm_utils.case_filtering import get_parametrized_options @@ -382,7 +385,7 @@ prompt_path_encoder=model_utils.qwen_prompt_path_encoder, ), ### Tensor parallel / multi-gpu broadcast tests - "broadcast-chameleon": VLMTestInfo( + "chameleon-broadcast": VLMTestInfo( models=["facebook/chameleon-7b"], prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, @@ -393,43 +396,25 @@ vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], hf_output_post_proc = lambda hf_output, model: hf_output[:2], comparator=check_outputs_equal, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ), - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), - "broadcast-llava": VLMTestInfo( + "llava-broadcast": VLMTestInfo( models=["llava-hf/llava-1.5-7b-hf"], prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ) - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), - "broadcast-llava_next": VLMTestInfo( + "llava_next-broadcast": VLMTestInfo( models=["llava-hf/llava-v1.6-mistral-7b-hf"], prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", max_model_len=10240, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ) - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), ### Custom input edge-cases for specific models @@ -468,6 +453,41 @@ # yapf: enable +def _mark_splits( + test_settings: dict[str, VLMTestInfo], + *, + num_groups: int, +) -> dict[str, VLMTestInfo]: + name_by_test_info_id = {id(v): k for k, v in test_settings.items()} + test_infos_by_model = defaultdict[str, list[VLMTestInfo]](list) + + for info in test_settings.values(): + for model in info.models: + test_infos_by_model[model].append(info) + + models = sorted(test_infos_by_model.keys()) + split_size = math.ceil(len(models) / num_groups) + + new_test_settings = dict[str, VLMTestInfo]() + + for i in range(num_groups): + models_in_group = models[i * split_size:(i + 1) * split_size] + + for model in models_in_group: + for info in test_infos_by_model[model]: + new_marks = (info.marks or []) + [pytest.mark.split(group=i)] + new_info = info._replace(marks=new_marks) + new_test_settings[name_by_test_info_id[id(info)]] = new_info + + missing_keys 
= test_settings.keys() - new_test_settings.keys() + assert not missing_keys, f"Missing keys: {missing_keys}" + + return new_test_settings + + +VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) + + ### Test wrappers # Wrappers around the core test running func for: # - single image diff --git a/tests/utils.py b/tests/utils.py index a893667e144a6..afeb708f3bcdc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -682,10 +682,12 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: - """Gets a pytest skipif mark, which triggers ig the the device doesn't have - meet a minimum memory requirement in gb; can be leveraged via - @large_gpu_test to skip tests in environments without enough resources, or - called when filtering tests to run directly. + """ + Get a pytest mark, which skips the test if the GPU doesn't meet + a minimum memory requirement in GB. + + This can be leveraged via `@large_gpu_test` to skip tests in environments + without enough resources, or called when filtering tests to run directly. """ try: if current_platform.is_cpu(): @@ -712,26 +714,37 @@ def large_gpu_test(*, min_gb: int): Currently, the CI machine uses L4 GPU which has 24 GB VRAM. """ - test_skipif = large_gpu_mark(min_gb) + mark = large_gpu_mark(min_gb) def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - return test_skipif(f) + return mark(f) return wrapper -def multi_gpu_test(*, num_gpus: int): - """ - Decorate a test to be run only when multiple GPUs are available. - """ - test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus") +def multi_gpu_marks(*, num_gpus: int): + """Get a collection of pytest marks to apply for `@multi_gpu_test`.""" + test_selector = pytest.mark.distributed(num_gpus=num_gpus) test_skipif = pytest.mark.skipif( cuda_device_count_stateless() < num_gpus, reason=f"Need at least {num_gpus} GPUs to run the test.", ) + return [test_selector, test_skipif] + + +def multi_gpu_test(*, num_gpus: int): + """ + Decorate a test to be run only when multiple GPUs are available. + """ + marks = multi_gpu_marks(num_gpus=num_gpus) + def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - return test_selector(test_skipif(fork_new_process_for_each_test(f))) + func = fork_new_process_for_each_test(f) + for mark in reversed(marks): + func = mark(func) + + return func return wrapper From 452a723bf2e8410ee9b47f82f90c7ea48aa6d14f Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 11 Dec 2024 18:34:54 -0500 Subject: [PATCH 06/85] [V1][Core] Remove should_shutdown to simplify core process termination (#11113) Signed-off-by: Tyler Michael Smith --- vllm/v1/engine/core.py | 13 ++----------- vllm/v1/engine/core_client.py | 6 ------ 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 55a5c4dff3a5c..a26ffe74a3ae8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -5,7 +5,6 @@ import threading import time from multiprocessing.process import BaseProcess -from multiprocessing.sharedctypes import Synchronized from typing import List, Tuple, Type, Union import zmq @@ -133,13 +132,9 @@ def __init__( input_path: str, output_path: str, ready_path: str, - should_shutdown: Synchronized, ): super().__init__(vllm_config, executor_class, usage_context) - # Signal from main process to shutdown (multiprocessing.Value). - self.should_shutdown = should_shutdown - # Background Threads and Queues for IO. 
These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, # and to overlap some serialization/deserialization with the @@ -195,7 +190,6 @@ def make_engine_core_process( input_path: str, output_path: str, ready_path: str, - should_shutdown: Synchronized, ) -> BaseProcess: # The current process might have CUDA context, # so we need to spawn a new process. @@ -210,7 +204,6 @@ def make_engine_core_process( "vllm_config": vllm_config, "executor_class": executor_class, "usage_context": usage_context, - "should_shutdown": should_shutdown } # Run EngineCore busy loop in background process. proc = context.Process(target=EngineCoreProc.run_engine_core, @@ -260,8 +253,8 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - # Loop until we get a shutdown signal. - while not self.should_shutdown: + # Loop until process is sent a SIGINT or SIGTERM + while True: # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: @@ -272,8 +265,6 @@ def run_busy_loop(self): except queue.Empty: self._log_stats() logger.debug("EngineCore busy loop waiting.") - if self.should_shutdown: - return except BaseException: raise diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4d96b323d1662..1d5ddf4db4d7c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,4 @@ import atexit -import multiprocessing from typing import List, Union import msgspec @@ -149,21 +148,16 @@ def __init__( self.input_socket.bind(input_path) # Start EngineCore in background process. - self.should_shutdown = multiprocessing.Value('b', False, lock=False) self.proc = EngineCoreProc.make_engine_core_process( *args, input_path=input_path, output_path=output_path, ready_path=ready_path, - should_shutdown=self.should_shutdown, **kwargs, ) atexit.register(self.shutdown) def shutdown(self): - # Send shutdown signal to background process. - self.should_shutdown = True - # Shut down the zmq context. self.ctx.destroy(linger=0) From 4e116833686f3e0c0a223b05b5859ad76843a017 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Wed, 11 Dec 2024 19:55:30 -0500 Subject: [PATCH 07/85] [V1] VLM preprocessor hashing (#11020) Signed-off-by: Roger Wang Signed-off-by: Alexander Matveev Co-authored-by: Michael Goin Co-authored-by: Roger Wang --- examples/offline_inference_vision_language.py | 126 ++++++++++++-- requirements-common.txt | 1 + tests/v1/engine/test_engine_core.py | 1 + tests/v1/engine/test_engine_core_client.py | 1 + vllm/config.py | 10 +- vllm/engine/arg_utils.py | 8 + vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/core.py | 18 +- vllm/v1/engine/mm_input_mapper.py | 156 ++++++++++++++++-- vllm/v1/engine/processor.py | 35 ++-- vllm/v1/utils.py | 21 +++ 11 files changed, 332 insertions(+), 48 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index c6a274ee5894b..5e210126dc8fe 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -5,6 +5,8 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
""" +import random + from transformers import AutoTokenizer from vllm import LLM, SamplingParams @@ -23,7 +25,9 @@ def run_llava(question: str, modality: str): prompt = f"USER: \n{question}\nASSISTANT:" - llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096) + llm = LLM(model="llava-hf/llava-1.5-7b-hf", + max_model_len=4096, + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids @@ -33,7 +37,9 @@ def run_llava_next(question: str, modality: str): assert modality == "image" prompt = f"[INST] \n{question} [/INST]" - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192) + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", + max_model_len=8192, + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids @@ -44,7 +50,9 @@ def run_llava_next_video(question: str, modality: str): assert modality == "video" prompt = f"USER: